## Import

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import ast
import json

## Gathering data
### Image predictions

In [2]:
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv').content 
image_predictions = pd.read_csv(io.StringIO(r.decode('utf-8')),sep='\t')
image_predictions.sample(1)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1325,756998049151549440,https://pbs.twimg.com/media/CoFlsGAWgAA2YeV.jpg,4,golden_retriever,0.678555,True,Labrador_retriever,0.072632,True,Border_terrier,0.049033,True


### Twitter archive enhanced

In [3]:
twitter_enhanced=pd.read_csv('data/twitter-archive-enhanced.csv')
twitter_enhanced.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


### Twitter JSON

In [4]:
tdata=[]
file=open('data/tweet-json.txt')
for lin in file:
    tdata.append(json.loads(lin))
df_twit_json=pd.DataFrame(data=tdata)
df_twit_json.columns

Index(['created_at', 'id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'entities', 'extended_entities', 'source',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'is_quote_status', 'retweet_count', 'favorite_count',
       'favorited', 'retweeted', 'possibly_sensitive',
       'possibly_sensitive_appealable', 'lang', 'retweeted_status',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status'],
      dtype='object')

### Merge in one master DF

##### Lets merge df_twit_json with twitter_enhanced, so first im going to change the column name in df_twit_json, id to tweet_id, just like in the twitter_enhanced dataframe.

In [5]:
df_twit_json.rename(columns={'id': 'tweet_id'}, inplace=True)
df_twit_json_to_merge=df_twit_json[['tweet_id','favorite_count','retweet_count','id_str']].copy()
df=pd.merge(left=twitter_enhanced,right=df_twit_json_to_merge, how='left', left_on='tweet_id',right_on='tweet_id')
df.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,favorite_count,retweet_count,id_str
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,,39467.0,8853.0,892420643555336193


#### Now lets merge with the Image predictions

In [6]:
df=pd.merge(left=df,right=image_predictions, how='left', left_on='tweet_id',right_on='tweet_id')

In [7]:
df.columns
df.to_excel("data/df_full.xlsx")

## Assesing Data

### Just for remember
##### Completeness: do we have all of the records that we should? Do we have missing records or not? Are there specific rows, columns, or cells missing?
##### Validity: we have the records, but they're not valid, i.e., they don't conform to a defined schema. A schema is a defined set of rules for data. These rules can be real-world constraints (e.g. negative height is impossible) and table-specific constraints (e.g. unique key constraints in tables).
##### Accuracy: inaccurate data is wrong data that is valid. It adheres to the defined schema, but it is still incorrect. Example: a patient's weight that is 5 lbs too heavy because the scale was faulty.
##### Consistency: inconsistent data is both valid and accurate, but there are multiple correct ways of referring to the same thing. Consistency, i.e., a standard format, in columns that represent the same data across tables and/or within tables is desired

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [9]:
df.favorite_count.isna().sum()

2

In [10]:
df.p1.isna().sum()

281

In [11]:
df.expanded_urls.duplicated().sum()

137

In [12]:
df.rating_numerator.sort_values(ascending=False)

979     1776
313      960
189      666
188      420
2074     420
        ... 
2335       1
605        1
2038       1
1016       0
315        0
Name: rating_numerator, Length: 2356, dtype: int64

### QUALITY

- column tweet_id is int64, we require to change to string
- data on column timestamp have +0000 we need to delete that caracters
- column timestamp is an object
- we need to delete the retweets, we can look the retweeted_status_id column
- There are a lot of errors on ratings, we need to check them
- There are some columns to delete, they dont bring more information
- We are going to delete rows with no Image predictions
- there are 2 rows with no tweets counts, we need to delete them

### TIDY

- 'tweet_id' and 'id_str' columns are the same, we need to remove one column
- The image predictions 

# Cleaning Data

### Quality

##### 1 Define: column tweet_id is int64, we require to change type to string

##### 1 Code:

In [13]:
df.tweet_id=df.tweet_id.astype('str')

##### 1 Test:

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   object 
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

##### 2 Define: data on column timestamp have +0000 we need to delete that caracters

##### 2Code:

In [15]:
df=df
df.timestamp=df.timestamp.astype('str')
df.timestamp = df.timestamp.map(lambda x: str(x)[:-6])

##### 2 Test:

In [16]:
df.timestamp.sample(2)

1724    2015-12-24 18:00:19
1446    2016-02-08 00:27:39
Name: timestamp, dtype: object

##### 3 Define: Column timestamp is an object, we need to change the type to time

##### 3 Code:

In [17]:
df.timestamp = pd.to_datetime(df['timestamp'])

##### 3 Test:

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2356 non-null   object        
 1   in_reply_to_status_id       78 non-null     float64       
 2   in_reply_to_user_id         78 non-null     float64       
 3   timestamp                   2356 non-null   datetime64[ns]
 4   source                      2356 non-null   object        
 5   text                        2356 non-null   object        
 6   retweeted_status_id         181 non-null    float64       
 7   retweeted_status_user_id    181 non-null    float64       
 8   retweeted_status_timestamp  181 non-null    object        
 9   expanded_urls               2297 non-null   object        
 10  rating_numerator            2356 non-null   int64         
 11  rating_denominator          2356 non-null   int64       

##### 4 Define: We need to delete the retweets, we can look the retweeted_status_id column, and then delete it, they going to be empty.

##### 4 Code:

In [19]:
print('Nan count: ',df.retweeted_status_id.isna().sum())
print('df old shape: ',df.shape)
df=df[df['retweeted_status_id'].isnull()]
print('Nan count after delete retweets: ',df.retweeted_status_id.isna().sum())
print('df new shape: ',df.shape)

Nan count:  2175
df old shape:  (2356, 31)
Nan count after delete retweets:  2175
df new shape:  (2175, 31)


##### 4 Test:

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2175 non-null   object        
 1   in_reply_to_status_id       78 non-null     float64       
 2   in_reply_to_user_id         78 non-null     float64       
 3   timestamp                   2175 non-null   datetime64[ns]
 4   source                      2175 non-null   object        
 5   text                        2175 non-null   object        
 6   retweeted_status_id         0 non-null      float64       
 7   retweeted_status_user_id    0 non-null      float64       
 8   retweeted_status_timestamp  0 non-null      object        
 9   expanded_urls               2117 non-null   object        
 10  rating_numerator            2175 non-null   int64         
 11  rating_denominator          2175 non-null   int64       

##### 5 Define: There are some columns to delete, they dont bring more information (in_reply_to_status_id, in_reply_to_user_id,source,retweeted_status_id, retweeted_status_user_id retweeted_status_timestamp)

##### 5 Code:

In [21]:
df.drop(columns=['in_reply_to_status_id', 'in_reply_to_user_id','source','retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'],inplace=True)


##### 5 Test:

In [22]:
print('df new shape: ',df.shape)

df new shape:  (2175, 25)


##### 6 Define: We are going to delete rows with no Image predictions

##### 6 Code:

In [23]:
df.dropna(subset=['jpg_url'],inplace=True)

##### 6 Test:

In [24]:
print('df new shape: ',df.shape)
print('null values: ',df.jpg_url.isna().sum())

df new shape:  (1994, 25)
null values:  0


##### 7 Define: There are errors on ratings, E.g. wrong data rating, 5 and it should be 13.5

##### 7 Code:

In [28]:
df2['text'].str.extract(pat ='(([\d]*[/][1][0])|([^.a-z][\d]*[.][\d]*[/][\d]*)|([\d]+[/][\d]*))')


Unnamed: 0,0,1,2,3
0,13/10,13/10,,
1,13/10,13/10,,
2,12/10,12/10,,
3,13/10,13/10,,
4,12/10,12/10,,
...,...,...,...,...
2351,5/10,5/10,,
2352,6/10,6/10,,
2353,9/10,9/10,,
2354,7/10,7/10,,


In [29]:
df2[['data_to_extract']]

KeyError: "None of [Index(['data_to_extract'], dtype='object')] are in the [columns]"

In [30]:
df2=df.copy()
pd.set_option("display.max_colwidth", -1)
df2['data_to_extract']=df2['text'].str.extract(pat ='(([\d]*[/][1][0])|([^.a-z][\d]*[.][\d]*[/][\d]*)|([\d]+[/][\d]*))')
df2.data_to_extract=df2.data_to_extract.astype('str')
#df2[['data_to_extract','rating_numerator','text']].sort_values(by='data_to_extract')

numbers=df2["data_to_extract"].str.split("/", n = 1, expand = True)
df2['rating_numerator_v2']=numbers[0]
df2['rating_denominator_v2']=numbers[1]
#df2.drop(columns =["data_to_extract"], inplace = True)
#df2.iloc[[1712]]

  pd.set_option("display.max_colwidth", -1)


ValueError: Wrong number of items passed 4, placement implies 1

In [None]:
print(df.shape)
print(df2.)

##### 7 Test:

##### 8 Define: there are 2 rows with no tweets counts, we need to delete them

##### 8 Code:

##### 8 Test: