# Twitter Data Wrangling

## Gather

In [1]:
import pandas as pd
import numpy as np
import requests as rq
import os
import tweepy 
import matplotlib as mtp
import json
import datetime

# three data sources to collect tweetes :
# 1- twitter-archive-enhanced.csv
# 2- image-predictions.tsv URL : we will user requests to get it  
# 3- tweets_json : as I failed to get access for twitter API I will parse the tweets data from the  tweets_json URL ans store it in text file tweets_json.txt 

tsv_url ='https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
json_url = 'https://video.udacity-data.com/topher/2018/November/5bf60cc8_tweet-json/tweet-json'

csv_file = 'twitter-archive-enhanced.csv'
# output files 
tsv_file = 'image-predictions.tsv'
json_file = 'tweet_json.txt'
# Loadig CSV data into data frame
print ("--------------Loading CSV DataFrame.--------------")
df_csv= pd.read_csv(csv_file)

# get the tsv file from tsv URL and save it into file 
print ("--------------Loading TSV DataFrame.--------------")
response = rq.get(tsv_url)
with open(tsv_file,'wb') as f:
    f.write(response.content)
df_tsv= pd.read_csv(tsv_file,sep='\t')
# get the json file from json URL ,parse ,and save it into file 
print ("--------------Loading JSON DataFrame.--------------")
response = rq.get(json_url)
data = []
for line in response.iter_lines():
    data.append(json.loads(line))
df_json = pd.DataFrame(data)
# Saving the result to local file 
df_json.to_csv(json_file,sep=' ', index=False, header=False)
print ("--------------Loading DataFrames is Done .--------------")

--------------Loading CSV DataFrame.--------------
--------------Loading TSV DataFrame.--------------
--------------Loading JSON DataFrame.--------------
--------------Loading DataFrames is Done .--------------


## Assess

### exploring Tweets data in the CSV frames data 

In [2]:
df_csv.info()
df_csv.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


### Assess rating_numerator and rating_denominator

In [3]:
df_csv.loc[(df_csv.rating_numerator > 15)]['rating_numerator'].value_counts()
df_csv.loc[(df_csv.rating_denominator != 10)]['rating_denominator'].value_counts()

50     3
11     3
20     2
80     2
90     1
40     1
130    1
2      1
70     1
7      1
110    1
170    1
120    1
15     1
16     1
150    1
0      1
Name: rating_denominator, dtype: int64

### Asses if there are records for retweets 

In [4]:
df_csv.loc[( pd.isnull(df_csv.retweeted_status_id))].shape[0]


2175

### exploring Tweets data in the TSV frames data 

In [5]:
#df_tsv.set_index(['p1_dog','p2_dog','p3_dog']) 
#df_tsv.loc[(df_tsv['p1_dog'] == False  )].loc[(df_tsv['p2_dog'] == False )].loc[(df_tsv['p3_dog'] == False )]

df_tsv.info()
df_tsv.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


### exploring Tweets data in the JSON frames data 

In [6]:
df_json.info()
df_json.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   created_at                     2354 non-null   object 
 1   id                             2354 non-null   int64  
 2   id_str                         2354 non-null   object 
 3   full_text                      2354 non-null   object 
 4   truncated                      2354 non-null   bool   
 5   display_text_range             2354 non-null   object 
 6   entities                       2354 non-null   object 
 7   extended_entities              2073 non-null   object 
 8   source                         2354 non-null   object 
 9   in_reply_to_status_id          78 non-null     float64
 10  in_reply_to_status_id_str      78 non-null     object 
 11  in_reply_to_user_id            78 non-null     float64
 12  in_reply_to_user_id_str        78 non-null     o

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,extended_entities,source,in_reply_to_status_id,...,favorite_count,favorited,retweeted,possibly_sensitive,possibly_sensitive_appealable,lang,retweeted_status,quoted_status_id,quoted_status_id_str,quoted_status
0,Tue Aug 01 16:23:56 +0000 2017,892420643555336193,892420643555336193,This is Phineas. He's a mystical boy. Only eve...,False,"[0, 85]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892420639486877696, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,39467,False,False,False,False,en,,,,
1,Tue Aug 01 00:17:27 +0000 2017,892177421306343426,892177421306343426,This is Tilly. She's just checking pup on you....,False,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892177413194625024, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,33819,False,False,False,False,en,,,,
2,Mon Jul 31 00:18:03 +0000 2017,891815181378084864,891815181378084864,This is Archie. He is a rare Norwegian Pouncin...,False,"[0, 121]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 891815175371796480, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,25461,False,False,False,False,en,,,,
3,Sun Jul 30 15:58:51 +0000 2017,891689557279858688,891689557279858688,This is Darla. She commenced a snooze mid meal...,False,"[0, 79]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 891689552724799489, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,42908,False,False,False,False,en,,,,
4,Sat Jul 29 16:00:24 +0000 2017,891327558926688256,891327558926688256,This is Franklin. He would like you to stop ca...,False,"[0, 138]","{'hashtags': [{'text': 'BarkWeek', 'indices': ...","{'media': [{'id': 891327551943041024, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,41048,False,False,False,False,en,,,,


#### Assess retweet , and favourite info from the data  


In [7]:
print(df_json.loc[(df_json.favorited == True)].shape[0])
print(df_json.loc[(df_json.favorite_count > 0)].shape[0])
print(df_json.loc[(df_json.retweeted == True)].shape[0])
print(df_json.loc[(df_json.retweet_count > 0)].shape[0])


8
2175
0
2353


## **Data Quality Issues**:
### **Validity**:
#### - Tweet Data in **df_CSV**: 'timestamp'  is formatted as string not datetime
#### - Tweet Data in **df_JSON**: 'created_at'  is formatted as string not datetime
### Accuracy:
#### - Tweet Data in **df_CSV**: rating_numerator below 15 is 2330 , 26 records found to have odd values like 420 , 72 ,.... 
#### - Tweet Data in **df_CSV**: rating_denominator should be 10 , but only 2333 records found to be 10 , 23 records found to have odd values like 0 , 170,.... found 
#### - Tweet Data in **df_CSV**: data includes retweets and replys not only original tweets 
#### - Tweet Data in df_CSV: doggo, floofer, pupper ,puppo  variable have zero null values , however  String 'None' is used instead .     
### Consistancy:
#### - Tweet Data in **df_JSON** :  has 0 records with retweeted = True , however 2353 records found to have retweet_count > 0
#### - Tweet Data in **df_JSON** :  has 8 records with favorited = True , however 2175 records found to have favorite_count > 0
### Data Tidiness Issues:
#### - Tweet Data in **df_CSV**: Values of dog_stages 'doggo', 'floofer', 'pupper', 'puppo' are represented as Variable 
#### - Tweet Data in **df_TSV**: p1, p2,p3 are three columns for the same variable for the breed that
#### - Tweet Data in **df_TSV**: p1_conf, p2_conf,p3_conf are three columns for the same variable for the breed that
#### - Tweet Data in **df_TSV**: p1_dog, p2_dog,p3_dog are three columns for the same variable for the breed that
#### - Tweet Data in **df_JSON**: retweet is represented in df_csv.retweeted_status_id and df_json.retweeted

## Clean


### Define
#### **Validity**:
#### - Tweet Data in **df_CSV**: format 'timestamp' as datetime
#### - Tweet Data in **df_JSON**: format 'created_at'  as datetime

#### Code

In [8]:
df_csv_clean = df_csv.copy()
df_tsv_clean = df_tsv.copy()
df_json_clean = df_json.copy()
df_csv_clean['timestamp'] = pd.to_datetime(df_csv_clean.timestamp)
df_json_clean['created_at'] = pd.to_datetime(df_json_clean.created_at)

#### Test

In [9]:

df_csv_clean.info()
df_json_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   int64              
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    float64            
 7   retweeted_status_user_id    181 non-null    float64            
 8   retweeted_status_timestamp  181 non-null    object             
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

### Define
#### **Accuracy**:
#### -Tweet Data in df_CSV: rating_numerator above 15 and rating_denominator != 10 are invalid , records to be deleted :
#### Solution : delete records having rating_numerator above 15 or rating_denominator != 10 
#### - Tweet Data in df_CSV: data includes retweets and replys
#### solution : delete records 'having retweeted_status_id'  or 'in_reply_to_status_id'  not null 
#### - Tweet Data in df_CSV: doggo, floofer, pupper ,puppo  variable have zero null values , however  String 'None' is used instead .     
#### solution : Clean doggo, floofer, pupper ,puppo  variable (the last four columns ) by replcing 'None' with ''


### Code


In [10]:
df= df_csv_clean.drop(df_csv_clean[df_csv_clean['rating_denominator'] != 10].index, inplace= True)
df= df_csv_clean.drop(df_csv_clean[df_csv_clean['rating_numerator'] >15].index, inplace= True)

df= df_csv_clean.drop(df_csv_clean[pd.notnull(df_csv_clean['retweeted_status_id'])].index, inplace= True)
df= df_csv_clean.drop(df_csv_clean[pd.notnull(df_csv_clean['in_reply_to_status_id'])].index, inplace= True)
# Clean doggo, floofer, pupper ,puppo  variable (the last four columns )
df_csv_clean.iloc[:, -5:  ] = df_csv_clean.iloc[:, -5:].replace('None','')








  

### Test


In [11]:
print (df_csv_clean.loc[( df_csv_clean.rating_denominator != 10)].shape[0])
print (df_csv_clean.loc[( df_csv_clean.rating_numerator >15)].shape[0])
print (df_csv_clean.loc[(  pd.notnull(df_csv_clean.retweeted_status_id))].shape[0])
print (df_csv_clean.loc[(  pd.notnull(df_csv_clean.in_reply_to_status_id))].shape[0])
print (df_csv_clean.loc[( df_csv_clean.doggo =='None')].shape[0])
print (df_csv_clean.loc[( df_csv_clean.floofer =='None')].shape[0])
print (df_csv_clean.loc[( df_csv_clean.pupper =='None')].shape[0])
print (df_csv_clean.loc[( df_csv_clean.puppo =='None')].shape[0])



0
0
0
0
0
0
0
0


### Define
#### **Consistancy**:
#### - Tweet Data in df_JSON :  has 0 records with retweeted = True , however 2353 records found to have retweet_count > 0
#### - Tweet Data in df_JSON :  has 8 records with favorited = True , however 2175 records found to have favorite_count > 0
#### Solution : remove retweeted variable and favorited column and use retweet_count and favorite_count to judge if the tweet is favirited or retweeted 


### Code

In [12]:
df_json_clean.drop(['retweeted','favorited'], axis=1, inplace=True)

### Test 

In [13]:
df_json_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2354 non-null   datetime64[ns, UTC]
 1   id                             2354 non-null   int64              
 2   id_str                         2354 non-null   object             
 3   full_text                      2354 non-null   object             
 4   truncated                      2354 non-null   bool               
 5   display_text_range             2354 non-null   object             
 6   entities                       2354 non-null   object             
 7   extended_entities              2073 non-null   object             
 8   source                         2354 non-null   object             
 9   in_reply_to_status_id          78 non-null     float64            
 10  in_reply_to_status_id_st

### Define
#### **DataTideness**:
####- Tweet Data in df_CSV: Values of dog_stages 'doggo', 'floofer', 'pupper', 'puppo' are represented as Variable 
#### Solution : 
####        - Add dog_type by concatenation  'doggo', 'floofer', 'pupper', 'puppo' 
####        - recurseviely clean the '-' 
####        - Finally drop the columns
	

### Code

In [14]:
# Add dog_type by concatenation 
df_csv_clean['dog_type'] = df_csv_clean.doggo +'-'+ df_csv_clean.floofer+'-' + df_csv_clean.pupper +'-'+ df_csv_clean.puppo
# recurseviely clean the '---' , '-' , Starting and ending '-'
df_csv_clean['dog_type']= df_csv_clean['dog_type'].str.replace('---','-')
df_csv_clean['dog_type']= df_csv_clean['dog_type'].str.replace('--','-')
df_csv_clean['dog_type']= df_csv_clean['dog_type'].str.strip(to_strip='-')
# Finally drop the columns 
df_csv_clean.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'], inplace=True)

In [15]:
### Test

In [16]:
df_csv_clean['dog_type'].value_counts()
df_csv_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2355
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2075 non-null   int64              
 1   in_reply_to_status_id       0 non-null      float64            
 2   in_reply_to_user_id         0 non-null      float64            
 3   timestamp                   2075 non-null   datetime64[ns, UTC]
 4   source                      2075 non-null   object             
 5   text                        2075 non-null   object             
 6   retweeted_status_id         0 non-null      float64            
 7   retweeted_status_user_id    0 non-null      float64            
 8   retweeted_status_timestamp  0 non-null      object             
 9   expanded_urls               2072 non-null   object             
 10  rating_numerator            2075 non-null   int64           

In [17]:
### Define
#### **DataTideness**:
### - Tweet Data in **df_TSV**: p1, p2,p3 are three columns for the same variable for the breed that
#### - Tweet Data in **df_TSV**: p1_conf, p2_conf,p3_conf are three columns for the same variable for the breed that
#### - Tweet Data in **df_TSV**: p1_dog, p2_dog,p3_dog are three columns for the same variable for the breed that
####  Solution:
####        - rename columns to fit with pd.wide_to_long()
####        - convert Variables into values using pd.wide_to_long()

In [18]:
### Code

In [19]:
# rename columns to fit with pd.wide_to_long()
df_tsv_clean = df_tsv.copy()
cols = ['tweet_id', 'jpg_url', 'img_num', 
       'predict_1', 'conf_1', 'isdog_1',
       'predict_2', 'conf_2', 'isdog_2',
       'predict_3', 'conf_3', 'isdog_3']
df_tsv_clean.columns = cols
# Use Wide to Long 
df_tsv_clean = pd.wide_to_long(df_tsv_clean, stubnames=['predict', 'conf', 'isdog'],i=['tweet_id', 'jpg_url', 'img_num'], j='predict-index', sep="_").reset_index()



In [20]:
### Test

In [21]:
df_tsv_clean.info()
df_tsv_clean.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6225 entries, 0 to 6224
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tweet_id       6225 non-null   int64  
 1   jpg_url        6225 non-null   object 
 2   img_num        6225 non-null   int64  
 3   predict-index  6225 non-null   int64  
 4   predict        6225 non-null   object 
 5   conf           6225 non-null   float64
 6   isdog          6225 non-null   bool   
dtypes: bool(1), float64(1), int64(3), object(2)
memory usage: 298.0+ KB


Unnamed: 0,tweet_id,jpg_url,img_num,predict-index,predict,conf,isdog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,1,Welsh_springer_spaniel,0.465074,True
1,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,2,collie,0.156665,True
2,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,3,Shetland_sheepdog,0.061428,True
3,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,1,redbone,0.506826,True
4,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,2,miniature_pinscher,0.074192,True


### Define
#### **DataTideness**:
#### - Tweet Data as a single ovservational unit is not is a single table : Tweet Data is douplicated over df_JSON and df_CSV .
#### Solution :
####    merge 'retweet_count','favorite_count' data into df_CSV  using tweet_id   

In [22]:
### Code
#Select the fields for join'id','retweet_count','favorite_count'
df1 =df_json[['id','retweet_count','favorite_count']]
# Make a left join
df_csv_clean = df_csv_clean.merge(df1,how='left',left_on=['tweet_id'],right_on=['id'])
# drop the extra id column used in join 
df_csv_clean.drop(['id'], axis= 1, inplace= True)


### Test


In [23]:
df_csv_clean.info()
df_csv_clean.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2075 non-null   int64              
 1   in_reply_to_status_id       0 non-null      float64            
 2   in_reply_to_user_id         0 non-null      float64            
 3   timestamp                   2075 non-null   datetime64[ns, UTC]
 4   source                      2075 non-null   object             
 5   text                        2075 non-null   object             
 6   retweeted_status_id         0 non-null      float64            
 7   retweeted_status_user_id    0 non-null      float64            
 8   retweeted_status_timestamp  0 non-null      object             
 9   expanded_urls               2072 non-null   object             
 10  rating_numerator            2075 non-null   int64           

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,dog_type,retweet_count,favorite_count
0,892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,8853,39467
1,892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,6514,33819
2,891815181378084864,,,2017-07-31 00:18:03+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,4328,25461
3,891689557279858688,,,2017-07-30 15:58:51+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,8964,42908
4,891327558926688256,,,2017-07-29 16:00:24+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,9774,41048


In [None]:
### Finally store the data frames for Tweets and image prediction 

In [24]:
df_csv_clean.to_csv('Twitter.csv')
df_tsv_clean.to_csv('image-prediction.csv')