<h2 style="color:black; font-style:italic;">Importing modules</h2>

In [None]:
import pandas as pd
import requests
import os   
import json  
import re as re  
from matplotlib import pyplot as plt  
import numpy as np                          

<h4 style="color:black; font-weight:600;"><a href="#gathering">Data gathering</a></h4>
<ol style="color:black">
<li><a href="#gathering_img_pred">Gathering image predictions file</a></li>
<li><a href="#gathering_tweet_json">Gathering tweet JSON file</a></li>
<li><a href="#gathering_tweet_archive">Gathering tweet archive file</a></li>
</ol>

<h4 style="color:black; font-weight:600;"><a href="#assessment">Data assessment</a></h4>
<ol style="color:black">
<li><a href="#assessment_img_pred">Assessment image predictions file</a></li>
<ul>
<li><a href="#question_1">Column names are not descriptive</a></li>
<li><a href="#question_2">The column img_num is useless</a></li>
<li><a href="#question_3">Dog names do not start with capital letters</a></li>
</ul>
<li><a href="#assessment_tweet_json">Assessment tweet JSON file</a></li>
<ul>
<li><a href="#question_4">All coordinates data are null</a></li>
<li><a href="#question_5">Tweet_source column contain HTML code</a></li>
<li><a href="#question_6">Some columns are not useful</a></li>
<li><a href="#question_7">Tweet date is not at good format</a></li>
</ul>
<li><a href="#assessment_tweet_archive">Assessment tweet archive file</a></li>
<ul>
<li><a href="#question_8">There are many null values and retweets that are not ratings</a></li>
<li><a href="#question_9">Columns names do not start with capital letters</a></li>
<li><a href="#question_10">Tweet_source column contain HTML code</a></li>
<li><a href="#question_11">Timestamp is not at good format</a></li>
<li><a href="#question_12">Doggo, Floofer, Pupper, and Puppo columns can be converted into one column</a></li>
<li><a href="#question_13">The 3 tables are part of the same dataset and  must be merged</a></li>
</ul>
</ol>

<h4 style="color:black; font-weight:600;"><a href="#cleaning">Data cleaning</a></h4>
<ol style="color:black">
<h5 style="color:black; font-weight:600;">Quality problems</h5>
<li><a href="#answer_1">Column names are not descriptive (image predictions)</a></li>
<li><a href="#answer_2">The column img_num is useless (image predictions)</a></li>
<li><a href="#answer_3">Dog names do not start with capital letters (image predictions)</a></li>
<li><a href="#answer_4">All coordinates data are null (tweet JSON)</a></li>
<li><a href="#answer_5">Tweet_source column contain HTML code (tweet JSON)</a></li>
<li><a href="#answer_6">Some columns are not useful (tweet JSON)</a></li>
<li><a href="#answer_7">Tweet date is not at good format (tweet JSON)</a></li>
<li><a href="#answer_8">There are many null values and retweets that are not ratings (tweet archive)</a></li>
<li><a href="#answer_9">Columns names do not start with capital letters (tweet archive)</a></li>
<li><a href="#answer_10">Tweet_source column contain HTML code (tweet archive)</a></li>
<li><a href="#answer_11">Timestamp is not at good format (tweet archive)</a></li>
<h5 style="color:black; font-weight:600;">Tidiness problems</h5>
<li><a href="#answer_12">Doggo, Floofer, Pupper, and Puppo columns can be converted into one column (tweet archive)</a></li>
<li><a href="#answer_13">The 3 tables are part of the same dataset and  must be merged</a></li>
</ol>

<br /><br /><br /><br /><br /><br /><br />
<hr /><hr />
<h1 style="color:black; font-style:italic; font-weight:500;" id="gathering">Gathering data</h1>
<hr /><hr />
<br /><br /><br />

<hr />
<br />
<h3 style="color:black; font-style:italic; font-weight:300;" id="gathering_img_pred">Gathering image predictions file</h3>

In [None]:
# Create the folder that holds every gathered data file.
# exist_ok=True makes the call a no-op when the folder is already there, so the
# cell can be re-run safely without a separate existence check.
folder_name = 'tweeter_project_data'
os.makedirs(folder_name, exist_ok=True)

In [None]:
# Gathering image_prediction: download the TSV into the data folder, then load it.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
image_predictions_path = os.path.join(folder_name, url.split('/')[-1])

response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the data file.
response.raise_for_status()
with open(image_predictions_path, mode='wb') as file:
    file.write(response.content)

# Read back the file that was just written (tab-separated values).
df_images_predictions = pd.read_csv(image_predictions_path, sep='\t')
df_images_predictions.sample(5)

<hr />
<br />
<h3 style="color:black; font-style:italic; font-weight:300;" id="gathering_tweet_archive">Gathering tweet archive file</h3>

In [None]:
# Wrangling twitter-archive-enhanced.csv
df_tweet_archive = pd.read_csv('./tweeter_project_data/twitter-archive-enhanced.csv')
df_tweet_archive.sample(5)

<hr />
<br />
<h3 style="color:black; font-style:italic; font-weight:300;" id="gathering_tweet_json">Gathering tweet JSON file</h3>

In [None]:
# Gathering tweet_json.txt
#
# Query Twitter's API for each tweet in the Twitter archive and save the returned
# JSON, one tweet per line, in a text file.

import json
from timeit import default_timer as timer

tweet_json_path = './tweeter_project_data/tweet_json.txt'

# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = df_tweet_archive.tweet_id.values

# Both stay at their initial value when the API query below is skipped.
count = 0
fails_dict = {}

# Opening the output file in 'w' mode truncates it, so running this cell with the
# placeholder credentials below would wipe a previously gathered file and leave it
# empty. Only query the API when no non-empty copy exists; delete the file to
# force a fresh download.
if os.path.exists(tweet_json_path) and os.path.getsize(tweet_json_path) > 0:
    print('Found existing ' + tweet_json_path + ', skipping the Twitter API query')
else:
    # Imported here so the notebook still runs without tweepy installed when the
    # file is already available.
    import tweepy
    from tweepy import OAuthHandler

    # tweepy 4 renamed TweepError to TweepyException; use whichever one exists.
    TweepError = getattr(tweepy, 'TweepError', None) or tweepy.TweepyException

    # These are hidden to comply with Twitter's API terms and conditions
    consumer_key = '...'
    consumer_secret = '...'
    access_token = '...'
    access_secret = '...'

    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)

    api = tweepy.API(auth, wait_on_rate_limit=True)

    start = timer()
    # Save each tweet's returned JSON as a new line in a .txt file
    with open(tweet_json_path, 'w') as outfile:
        # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
        for tweet_id in tweet_ids:
            count += 1
            print(str(count) + ": " + str(tweet_id))
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                print("Success")
                json.dump(tweet._json, outfile)
                outfile.write('\n')
            except TweepError as e:
                # Best effort: remember why this tweet failed and keep going.
                print("Fail")
                fails_dict[tweet_id] = e
    end = timer()
    print(end - start)
    print(fails_dict)

In [None]:
# Load the gathered tweets: tweet_json.txt holds one JSON document per line.
# The mapping keeps each JSON field next to the column name it gets, so the
# selection and the renaming cannot drift out of order.
tweet_json_columns = {
    'id': 'Tweet_id',
    'id_str': 'Tweet_id_str',
    'favorite_count': 'Favorites',
    'retweet_count': 'Retweets',
    'source': 'Tweet_source',
    'lang': 'Language',
    'created_at': 'Tweet_date',
    'coordinates': 'Coordinates',
}
with open('./tweeter_project_data/tweet_json.txt') as file:
    df_tweet_json = pd.DataFrame(
        # Blank lines are skipped so they cannot break json.loads.
        (json.loads(line) for line in file if line.strip()),
        columns=list(tweet_json_columns),
    )
df_tweet_json = df_tweet_json.rename(columns=tweet_json_columns)
df_tweet_json.sample(5)

<br /><br /><br /><br /><br /><br /><br />
<hr /><hr />
<h1 style="color:black; font-style:italic; font-weight:500;" id="assessment">Assessing Data</h1>
<hr /><hr />
<br /><br /><br />

<h2 style="color:black; font-style:italic; font-weight:400;">Image-predictions</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's find the problems about this dataset</h3>
<br />

In [None]:
df_images_predictions.sample(5)

In [None]:
df_images_predictions.shape

In [None]:
df_images_predictions.info()

In [None]:
df_images_predictions.describe()

<h4 style="color:black; font-weight:600;" id="question_1">1-) Column names are not descriptive</h4>
<p>Column names do not allow you to know what data each column contains. <br> It will be necessary to rename the columns so that they are more explicit.</p>

In [None]:
df_images_predictions.columns.tolist()

<h4 style="color:black; font-weight:600;" id="question_2">2-) The column img_num is useless</h4>
<p>The column img_num doesn't give useful information. <br /> It ought to be removed.</p>

In [None]:
df_images_predictions.img_num.unique().tolist()

<h4 style="color:black; font-weight:600;" id="question_3">3-) Dog names do not start with capital letters</h4>
<ul style="color:black">
<li>malinois instead of Malinois</li>
<li>etc.</li>
</ul>

In [None]:
df_images_predictions.p2.unique().tolist()

<br />
<h2 style="color:black; font-style:italic; font-weight:400;">Tweet Json</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's find the problems about this dataset</h3>
<br />

In [None]:
df_tweet_json.sample(5)

In [None]:
df_tweet_json.info()

<h4 style="color:black; font-weight:600;" id="question_4">4-) All coordinates data are null</h4>
<p>The coordinates data are all null. <br> Because this column is useless, it has to be removed.</p>

In [None]:
df_tweet_json.Coordinates.notnull().sum()

<h4 style="color:black; font-weight:600;" id="question_5">5-) Tweet_source column contain HTML code</h4>
<p>The column Tweet_source tells us about the source of the tweet.<br /> But this column comes with HTML tags. <br> It would be nice to keep only the string inside the HTML tag.</p>

In [None]:
df_tweet_json.Tweet_source.unique().tolist()

<h4 style="color:black; font-weight:600;" id="question_6">6-) Some columns are not useful</h4>
<p>The columns Tweet_id and Tweet_id_str give us the same information but in different formats.<br /> So, we can keep just one column instead of two.</p>

In [None]:
df_tweet_json[['Tweet_id','Tweet_id_str']].sample(5)

<h4 style="color:black; font-weight:600;" id="question_7">7-) Tweet date is not at good format</h4>
<p>The column Tweet_date must be in date format, but it comes as an object. <br /> It will be better to turn it into date format.</p>

In [None]:
df_tweet_json.Tweet_date.info()

<br />
<h2 style="color:black; font-style:italic; font-weight:400;">Tweet Archive</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's find the problems about this dataset</h3>
<br />

In [None]:
df_tweet_archive.info()

In [None]:
df_tweet_archive.sample(5)

<h4 style="color:black; font-weight:600;" id="question_8">8-) There are many null values and retweets that are not ratings</h4>
<p>The dataset contains many undefined values</p>

In [None]:
df_tweet_archive.isna().sum()

<h4 style="color:black; font-weight:600;" id="question_9">9-) Columns names do not start with capital letters</h4>

In [None]:
df_tweet_archive.columns.tolist()

<h4 style="color:black; font-weight:600;" id="question_10">10-) Tweet_source column contain HTML code</h4>
<p>The column source tells us about the source of the tweet.<br /> But this column comes with HTML tags. <br> It would be nice to keep only the string inside the HTML tag.</p>

In [None]:
df_tweet_archive.source.sample(5).unique()

<h4 style="color:black; font-weight:600;" id="question_11">11-) Timestamp is not at good format</h4>
<p>The column Timestamp must be in date format, but it comes as an object. <br /> It will be better to turn it into date format.</p>

In [None]:
df_tweet_archive.timestamp.info()

<h4 style="color:black; font-weight:600;" id="question_12">12-) Doggo, Floofer, Pupper, and Puppo columns can be converted into one column</h4>

In [None]:
print(df_tweet_archive.doggo.unique())
print(df_tweet_archive.floofer.unique())
print(df_tweet_archive.pupper.unique())
print(df_tweet_archive.puppo.unique())

<br />
<h2 style="color:black; font-style:italic; font-weight:400;">About all dataset</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's find the problems about this dataset</h3>
<br />

<h4 style="color:black; font-weight:600;" id="question_13">13-) The 3 tables are part of the same dataset and  must be merged</h4>

In [None]:
print('The length of df_images_predictions is:',len(df_images_predictions))
print('The length of df_tweet_json is:',len(df_tweet_json))
print('The length of df_tweet_archive is:',len(df_tweet_archive))

<h4 style="color:black; font-weight:600;"><a href="#cleaning">These are all the problems detected</a></h4>
<ol style="color:black">
<li><a href="#answer_1">Column names are not descriptive (image predictions)</a></li>
<li><a href="#answer_2">The column img_num is useless (image predictions)</a></li>
<li><a href="#answer_3">Dog names do not start with capital letters (image predictions)</a></li>
<li><a href="#answer_4">All coordinates data are null (tweet JSON)</a></li>
<li><a href="#answer_5">Tweet_source column contain HTML code (tweet JSON)</a></li>
<li><a href="#answer_6">Some columns are not useful (tweet JSON)</a></li>
<li><a href="#answer_7">Tweet date is not at good format (tweet JSON)</a></li>
<li><a href="#answer_8">There are many null values and retweets that are not ratings (tweet archive)</a></li>
<li><a href="#answer_9">Columns names do not start with capital letters (tweet archive)</a></li>
<li><a href="#answer_10">Tweet_source column contain HTML code (tweet archive)</a></li>
<li><a href="#answer_11">Timestamp is not at good format (tweet archive)</a></li>
<li><a href="#answer_12">Doggo, Floofer, Pupper, and Puppo columns can be converted into one column (tweet archive)</a></li>
<li><a href="#answer_13">The 3 tables are part of the same dataset and  must be merged</a></li>
</ol>

<br /><br /><br /><br /><br /><br /><br />
<hr /><hr /><hr />
<h1 style="color:black; font-style:italic; font-weight:500;" id="cleaning">Cleaning Data</h1>
<hr /><hr /><hr />
<br /><br /><br />

<h2 style="color:black; font-style:italic; font-weight:400;" id="cleaning_img_pred">Image-predictions</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's resolve the problems about this dataset</h3>
<br />

In [None]:
#Let copy the dataframe
new_df_images_predictions = df_images_predictions.copy()
new_df_images_predictions.sample(5)

<h4 style="color:black; font-weight:600;" id="answer_1">1-) Column names are not descriptive</h4>

<p><b>Define</b></p>
<p>We will rename the column names to be more explicit.</p>

<p><b>Code</b></p>

In [None]:
#Cleaning
new_df_images_predictions.rename(columns={'tweet_id':'Tweet_id','jpg_url':'Image_link','img_num':'Image_number', 'p1':'First_prediction','p1_conf':'First_prediction_reliability','p1_dog':'First_prediction_truthfulness','p2':'Second_prediction','p2_conf':'Second_prediction_reliability','p2_dog':'Second_prediction_truthfulness','p3':'Third_prediction','p3_conf':'Third_prediction_reliability','p3_dog':'Third_prediction_truthfulness'}, inplace=True)

<p><b>Test</b></p>

In [None]:
#Test
new_df_images_predictions.columns.tolist()

<h4 style="color:black; font-weight:600;" id="answer_2">2-) The column Image_Number is useless</h4>

<p><b>Define</b></p>
<p>Let's create a function that removes a column from a dataframe</p>

<p><b>Code</b></p>

In [None]:
#Cleaning

def drop_column(dataframe, column):
    """Remove `column` from `dataframe` in place.

    The dataframe passed in is modified directly; nothing useful is returned.
    A column that is already absent is ignored, so the cell calling this
    function can be re-run without raising a KeyError.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to modify.
    column : str
        Name of the column to remove.

    Returns
    -------
    None
    """
    dataframe.drop(columns=[column], inplace=True, errors='ignore')
    return None

#Let drop the column Image_number from our dataset
drop_column(new_df_images_predictions,'Image_number')

<p><b>Test</b></p>

In [None]:
#Test
if('Image_number' not in new_df_images_predictions.columns.tolist()):
    print("The column Image_number doesn't exist")

<h4 style="color:black; font-weight:600;" id="answer_3">3-) Dog name do not start with capital letters</h4>

<p><b>Define</b></p>
<p>Let's create a function that capitalizes the first letter of each value in a column</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

def capital(dog_name):
    """Return `dog_name` capitalized with `str.capitalize`.

    The first character is upper-cased and the rest is lower-cased,
    e.g. 'malinois' becomes 'Malinois'.
    """
    capitalized_name = dog_name.capitalize()
    return capitalized_name

# Names of the prediction columns whose values must be capitalized
predictions = ['First_prediction','Second_prediction','Third_prediction']

# Apply capital() to every value of each prediction column
for prediction in predictions:
    capitalized_column = new_df_images_predictions[prediction].apply(capital)
    new_df_images_predictions[prediction] = capitalized_column

<p><b>Test</b></p>

In [None]:
new_df_images_predictions[['First_prediction','Second_prediction','Third_prediction']].sample(5)

<br />
<h2 style="color:black; font-style:italic; font-weight:400;">Tweet Json</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's resolve the problems about this dataset</h3>
<br />

In [None]:
#Let copy the dataframe
new_df_tweet_json = df_tweet_json.copy()
new_df_tweet_json.sample(5)

<h4 style="color:black; font-weight:600;" id="answer_4">4-) All coordinates data are null</h4>

<p><b>Define</b></p>
<p>Coordinate data are useless. So, we will drop this column</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

drop_column(new_df_tweet_json, 'Coordinates')

<p><b>Test</b></p>

In [None]:
#Test

def check_col(table, name):
    """Print whether the column `name` is present in `table`.

    Parameters
    ----------
    table : pandas.DataFrame
        Table whose columns are inspected.
    name : str
        Column name to look for.

    Returns
    -------
    None
        The result is only printed.
    """
    # print() already separates its arguments with one space, so the messages
    # carry no leading space of their own.
    if name in table.columns:
        print(name, "exists in this table")
    else:
        print(name, "doesn't exist in this table")

check_col(new_df_tweet_json,'Coordinates')

<h4 style="color:black; font-weight:600;" id="answer_5">5-) Tweet_source column contain HTML code</h4>

<p><b>Define</b></p>
<p>Let remove html tag from tweet_source</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

#Let create a function that will remove html tag from string 
def remove_tags(Tweet_source):
    """Return `Tweet_source` with every '<...>' tag removed.

    The match is non-greedy, so each tag is removed on its own and the
    text between two tags is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', Tweet_source)

new_df_tweet_json.Tweet_source = new_df_tweet_json.Tweet_source.apply(lambda x: remove_tags(x))

<p><b>Test</b></p>

In [None]:
#Test

new_df_tweet_json.Tweet_source.unique()

<h4 style="color:black; font-weight:600;" id="answer_6">6-) Some columns are not useful</h4>

<p><b>Define</b></p>
<p>The columns Tweet_id and Tweet_id_str give us the same information but in different formats.<br /> So, we can keep just one column instead of two.</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

drop_column(new_df_tweet_json,'Tweet_id_str')

<p><b>Test</b></p>

In [None]:
#Test

check_col(new_df_tweet_json,'Tweet_id_str')

<h4 style="color:black; font-weight:600;" id="answer_7">7-) Tweet date is not at good format</h4>

<p><b>Define</b></p>
<p>The column Tweet_date must be in date format, but it comes as an object. <br /> It will be better to turn it into date format.</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

new_df_tweet_json.Tweet_date = pd.to_datetime(new_df_tweet_json.Tweet_date)

<p><b>Test</b></p>

In [None]:
#Test

new_df_tweet_json.Tweet_date.info()

<br />
<h2 style="color:black; font-style:italic; font-weight:400;">Tweet Archive</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's resolve the problems about this dataset</h3>
<br />

In [None]:
#Let copy the dataframe
new_df_tweet_archive = df_tweet_archive.copy()
new_df_tweet_archive.sample(3)

<h4 style="color:black; font-weight:600;" id="answer_8">8-) There are many null values and retweets that are not ratings</h4>

<p><b>Define</b></p>
<p>The dataset contains many undefined values</p>
<p>We are going to delete the retweet and reply rows first, and then the useless columns</p>

<p><b>Cleaning</b></p>

In [None]:
def dropNullRow(table,columns):
    """Drop, in place, every row that has a value in any of `columns`.

    Despite the name, the rows removed are those where the column is
    NOT null; rows where all the listed columns are null are kept.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to modify in place.
    columns : list of str
        Columns to inspect, one after the other.

    Returns
    -------
    None
    """
    for column in columns:
        has_value = table[column].notnull()
        labels_to_drop = table.index[has_value.to_numpy()]
        table.drop(index=labels_to_drop, inplace=True)

dropNullRow(new_df_tweet_archive,['retweeted_status_id','in_reply_to_status_id','in_reply_to_user_id'])
new_df_tweet_archive.info()

In [None]:
#Cleaning

# Drop the reply/retweet columns and expanded_urls from the archive table
for useless_column in (
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'expanded_urls',
):
    drop_column(new_df_tweet_archive, useless_column)

<p><b>Test</b></p>

In [None]:
#Test

# Report, for each dropped column, whether it is still in the archive table
for dropped_column in (
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'expanded_urls',
):
    check_col(new_df_tweet_archive, dropped_column)

<h4 style="color:black; font-weight:600;" id="answer_9">9-) Columns names do not start with capital letters</h4>

<p><b>Define</b></p>
<p>The dataset's column name do not start with capital letter</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

new_df_tweet_archive.rename(columns={'tweet_id':'Tweet_id','timestamp':'Timestamp','source':'Tweet_source', 'text':'Tweet_text','rating_numerator':'Rating_numerator','rating_denominator':'Rating_denominator','name':'Dog_name','doggo':'Is_doggo','floofer':'Is_floofer','pupper':'Is_pupper','puppo':'Is_puppo'}, inplace=True)
new_df_tweet_archive.sample(5)

<p><b>Test</b></p>

In [None]:
#Test

new_df_tweet_archive.columns.tolist()

<h4 style="color:black; font-weight:600;" id="answer_10">10-) Tweet_source column contain HTML code</h4>

<p><b>Define</b></p>
<p>The column source tells us about the source of the tweet.<br /> But this column comes with HTML tags. <br> It would be nice to keep only the string inside the HTML tag.</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

#Let remove html tag from string
new_df_tweet_archive.Tweet_source = new_df_tweet_archive.Tweet_source.apply(lambda x: remove_tags(x))

<p><b>Test</b></p>

In [None]:
#Test

new_df_tweet_archive.Tweet_source.unique()

<h4 style="color:black; font-weight:600;" id="answer_11">11-) Timestamp is not at good format</h4>

<p><b>Define</b></p>
<p>The column Timestamp must be in date format, but it comes as an object. <br /> It will be better to turn it into date format.</p>

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

#Let convert Tweet_date to date format
new_df_tweet_archive.Timestamp = pd.to_datetime(new_df_tweet_archive.Timestamp)

<p><b>Test</b></p>

In [None]:
#Test

new_df_tweet_archive.Timestamp.info()

<h4 style="color:black; font-weight:600;" id="answer_12">12-) Doggo, Floofer, Pupper, and Puppo can be converted into one column</h4>

<p><b>Define</b></p>
<p>The Doggo, Floofer, Pupper and Puppo columns each flag one value of the same variable. <br /> We will combine them into a single column and drop the four original columns.</p>

<p><b>Cleaning</b></p>

In [None]:
def changeColumnValue(dataframe,column,old_value,new_value):
    """Replace `old_value` with `new_value` in one column of `dataframe`.

    The column is reassigned on the dataframe itself. Calling
    `replace(..., inplace=True)` on `dataframe[column]` would act on a
    temporary Series, which leaves the dataframe unchanged when pandas
    Copy-on-Write is active.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to modify (modified in place).
    column : str
        Name of the column to update.
    old_value, new_value
        Value to look for and the value to put in its place.

    Returns
    -------
    pandas.DataFrame
        The same `dataframe` object that was passed in.
    """
    dataframe[column] = dataframe[column].replace(old_value, new_value)
    return dataframe

# The four flag columns that get folded into a single column
columns = ['Is_doggo','Is_floofer', 'Is_pupper', 'Is_puppo']

# Turn the 'None' strings into NaN so that dropna() can discard them below
for column in columns:
    changeColumnValue(new_df_tweet_archive,column,'None',np.nan)


def _join_flags(row):
    """Join the non-null flag values of one row with commas."""
    return ','.join(row.dropna())


new_df_tweet_archive['DogBreed'] = new_df_tweet_archive[columns].apply(_join_flags, axis=1)

# The flag columns are now redundant; rows without any flag get 'None'
new_df_tweet_archive.drop(columns=columns, inplace=True)
changeColumnValue(new_df_tweet_archive,'DogBreed','','None')

<p><b>Test</b></p>

In [None]:

new_df_tweet_archive.DogBreed.unique()

<br />
<h2 style="color:black; font-style:italic; font-weight:400;">About all dataset</h2>
<hr />
<h3 style="color:black; font-style:italic; font-weight:300;">Let's resolve the problems about this dataset</h3>
<br />

<h4 style="color:black; font-weight:600;" id="answer_13">13-) The 3 tables are part of the same dataset and  must be merged</h4>

In [None]:
print(len(new_df_images_predictions))
print(len(new_df_tweet_json))
print(len(new_df_tweet_archive))

<p><b>Cleaning</b></p>

In [None]:
#Cleaning

# Join the three cleaned tables on their shared Tweet_id key. merge() defaults
# to an inner join, so only tweets present in all three tables are kept, and
# columns that share a name across tables get the _x / _y suffixes.
new_general_tweet_dataset = (
    new_df_images_predictions
    .merge(new_df_tweet_json, on='Tweet_id')
    .merge(new_df_tweet_archive, on='Tweet_id')
)
new_general_tweet_dataset.sample(5)

<p><b>Test</b></p>

In [None]:
#Test

new_general_tweet_dataset.sample(5)

In [None]:
#Let store new dataset to csv file

new_general_tweet_dataset.to_csv('twitter_archive_master.csv', index=False)

<br /><br /><br /><br /><br /><br /><br />
<hr /><hr /><hr />
<h1 style="color:black; font-style:italic; font-weight:500;">Data visualization</h1>
<hr /><hr /><hr />
<br /><br /><br />

In [None]:
new_general_tweet_dataset.columns.tolist()

In [None]:
new_general_tweet_dataset

In [None]:
new_general_tweet_dataset['Tweet_year'] = new_general_tweet_dataset['Tweet_date'].dt.year

In [None]:
year = new_general_tweet_dataset.Tweet_date

<h4 style="color:black; font-weight:600;">Let's visualize the number of retweets for each year</h4>

In [None]:
new_general_tweet_dataset.groupby('Tweet_year')['Retweets'].sum().plot.pie(legend = True)
plt.title('Number of retweet for each year')

<h4 style="color:black; font-weight:600;">Let's visualize the number of favorites for each year</h4>

In [None]:
new_general_tweet_dataset.groupby('Tweet_year')['Favorites'].sum().plot.pie(legend = True)
plt.title('Number of favorite for each year')

<h4 style="color:black; font-weight:600;">Let's visualize the reliability of the predictions</h4>
<p>The most reliable prediction is the first prediction</p>

In [None]:
new_general_tweet_dataset.sample(1)

In [None]:
# Bar chart of the mean reliability of the first, second and third predictions
reliability_columns = [
    'First_prediction_reliability',
    'Second_prediction_reliability',
    'Third_prediction_reliability',
]
new_general_tweet_dataset[reliability_columns].mean().plot.bar(rot=0, figsize=[10,7])
# The previous title was copied from the retweet chart and did not describe this plot.
plt.title('Mean reliability of each prediction')
plt.xlabel('Predictions')
plt.ylabel('Reliability')

<h4 style="color:black; font-weight:600;">Let's visualize the most used tweet source</h4>
<p>The most used tweet source is iPhone</p>

In [None]:
#Check if the columns Tweet_source_x and Tweet_source_y are the same
len(new_general_tweet_dataset) == len(new_general_tweet_dataset[new_general_tweet_dataset.Tweet_source_x==new_general_tweet_dataset.Tweet_source_y])

In [None]:
new_general_tweet_dataset['Tweet_source'] = new_general_tweet_dataset.Tweet_source_x
drop_column(new_general_tweet_dataset,'Tweet_source_x')
drop_column(new_general_tweet_dataset,'Tweet_source_y')

In [None]:
new_general_tweet_dataset.groupby('Tweet_source')['Tweet_id'].count().plot.bar(rot=0)
plt.title('Number of tweet from each type of device')
plt.xlabel('Type of device')
plt.ylabel('Number of tweet from device')

In [None]:
new_general_tweet_dataset.shape