Load the needed libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')     # This prevents displays of warnings which can be a distruction to viewing outputs
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
from gensim import corpora
import pandas as pd
import statistics
import string
import os
import re
import json
from textblob import TextBlob
import emoji

## Exploratory Data Analysis

The data was downloaded from Twitter and saved as a JSON file.

Creating a function to read and load the data from the JSON file

In [2]:
def read_json(json_file: str)->tuple:
    """
    json file reader to open and read a line-delimited json file
    (one json document per line) into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    tuple of (number of records read, list of the parsed records)

    Raises
    ------
    json.JSONDecodeError if a non-blank line is not valid json
    """
    
    tweets_data = []
    # the context manager closes the file even when a line fails to parse;
    # json text is UTF-8, so the encoding is stated instead of relying on the platform default
    with open(json_file, 'r', encoding='utf-8') as tweets_file:
        for line in tweets_file:
            # blank lines (e.g. a trailing newline) are skipped, json.loads rejects them
            if line.strip():
                tweets_data.append(json.loads(line))
    
    return len(tweets_data), tweets_data

Reading and loading the data with <b>read_json()</b> function

In [3]:
# relative path of the json file holding the downloaded tweets
data_source = "./data/covid19.json"

# read_json returns a pair: the total number of entries (tweet_len) and the parsed entries (tweet_list)
tweet_len, tweet_list = read_json(data_source)

Creating a class and methods to extract the tweets and create a pandas dataframe from them

In [4]:
class TweetDfExtractor:
    """
    this class parses a list of tweet dictionaries into per-field lists and,
    through get_tweet_df, into a pandas dataframe

    Args:
    -----
    tweets_list: list - tweet dictionaries, one per tweet
    """
    def __init__(self, tweets_list):
        
        self.tweets_list = tweets_list

    @staticmethod
    def _original_tweet(tweet)->dict:
        """Returns the embedded 'retweeted_status' of a retweet, otherwise the tweet itself."""
        return tweet['retweeted_status'] if 'retweeted_status' in tweet else tweet

    # an example function
    def find_statuses_count(self)->list:
        """Returns the 'statuses_count' of the author of every tweet."""
        return [tweet['user']['statuses_count'] for tweet in self.tweets_list]
        
    def find_full_text(self)->list:
        """
        Returns the text of every tweet, using the extended 'full_text' when it is present.

        For a retweet the extended text of the retweeted tweet is used. When no extended
        text exists, the tweet's own 'text' field is used.
        """
        text = []
        for tweet in self.tweets_list:
            holder = self._original_tweet(tweet)
            if 'extended_tweet' in holder:
                text.append(holder['extended_tweet']['full_text'])
            else:
                # falls back to the top-level text, also for retweets
                text.append(tweet['text'])
        return text
    
    def find_sentiments(self, text)->tuple:
        """
        Computes the TextBlob sentiment of every entry of text.

        Args:
        -----
        text: list - strings to analyse

        Returns
        -------
        tuple of (list of polarity scores, list of subjectivity scores).
        The subjectivity list is also stored on self.subjectivity.
        """
        polarity = []
        subjectivity = []
        for entry in text:
            # one TextBlob per entry gives both scores, instead of analysing the text twice
            sentiment = TextBlob(entry).sentiment
            polarity.append(sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
        self.subjectivity = subjectivity
        return polarity, self.subjectivity

    def find_created_time(self)->list:
        """Returns the 'created_at' value of every tweet."""
        return [tweet['created_at'] for tweet in self.tweets_list]

    def find_source(self)->list:
        """Returns the 'source' value of every tweet."""
        return [tweet['source'] for tweet in self.tweets_list]

    def find_screen_name(self)->list:
        """Returns the 'screen_name' of the author of every tweet."""
        return [tweet['user']['screen_name'] for tweet in self.tweets_list]

    def find_followers_count(self)->list:
        """Returns the 'followers_count' of the author of every tweet."""
        return [tweet['user']['followers_count'] for tweet in self.tweets_list]

    def find_friends_count(self)->list:
        """Returns the 'friends_count' of the author of every tweet."""
        return [tweet['user']['friends_count'] for tweet in self.tweets_list]

    def is_sensitive(self)->list:
        """Returns 'possibly_sensitive' of every tweet, None where the key is absent."""
        return [tweet.get('possibly_sensitive') for tweet in self.tweets_list]

    def find_favourite_count(self)->list:
        """Returns 'favorite_count' of every tweet; for a retweet, that of the retweeted tweet."""
        return [self._original_tweet(tweet)['favorite_count'] for tweet in self.tweets_list]
    
    def find_retweet_count(self)->list:
        """Returns 'retweet_count' of every tweet; for a retweet, that of the retweeted tweet."""
        return [self._original_tweet(tweet)['retweet_count'] for tweet in self.tweets_list]

    def find_hashtags(self)->list:
        """Returns the 'hashtags' entity of every tweet."""
        return [tweet['entities']['hashtags'] for tweet in self.tweets_list]

    def find_mentions(self)->list:
        """Returns the 'user_mentions' entity of every tweet."""
        return [tweet['entities']['user_mentions'] for tweet in self.tweets_list]

    def find_location(self)->list:
        """
        Returns the author's 'location' of every tweet.

        A tweet whose 'user' entry cannot be indexed (TypeError, e.g. it is None) yields
        None for that tweet only. The result always has one entry per tweet, so a single
        bad record can no longer empty the dataframe built from zip() in get_tweet_df.
        """
        location = []
        for tweet in self.tweets_list:
            try:
                location.append(tweet['user']['location'])
            except TypeError:
                location.append(None)
        return location

    def find_lang(self)->list:
        """Returns the 'lang' value of every tweet."""
        return [tweet['lang'] for tweet in self.tweets_list]
    
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """
        Builds a dataframe with one row per tweet from the extracted fields.

        Args:
        -----
        save: bool - when True the dataframe is also written to 'processed_tweet_data.csv'

        Returns
        -------
        dataframe (the 'place' column holds the value returned by find_location)
        """
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        # the order of the zipped lists must match the order of `columns`
        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df


Using the <b>TweetDfExtractor</b> class to create a pandas dataframe

In [5]:
# get_tweet_df is called with its default save=False, so no csv file is written here
tweet = TweetDfExtractor(tweet_list)    # creates an instance of TweetDfExtractor from the loaded tweets
tweet_df = tweet.get_tweet_df()         # creates pandas dataframe using get_tweet_df method of TweetDfExtractor

In [138]:
class TweetDfDataPreparation:
    """
    this class prepares tweets data from a tweet dataframe for modelling and visualization

    The preparation steps (slice, drop duplicates, clean, convert, classify) work on
    self.sliced_tweet_df and always replace it with a new dataframe, so the dataframe
    passed to the constructor is never modified.

    Args:
    -----
    tweets_df: dataframe of tweets
    """
    def __init__(self, tweets_df):
        
        self.tweets_df = tweets_df
        # working frame of the preparation steps; starts as the full frame so the steps
        # also work when slice_dataframe has not been called first
        self.sliced_tweet_df = tweets_df

    def _set_column(self, column_name, values)->None:
        """Stores values under column_name on a fresh copy of the working frame."""
        # assigning on an explicit copy avoids writing into a slice of another dataframe
        prepared = self.sliced_tweet_df.copy()
        prepared[column_name] = values
        self.sliced_tweet_df = prepared
        
    def print_df_info(self) -> None:
        """
        this function will print the info of the tweets dataframe: the columns, the number
        of rows, the number of duplicate tweets (based on the original_text column) and the
        missing values

        Return
        ------
        None
        """
        #save the number of columns and names
        column_names = [str(col) for col in self.tweets_df.columns]
        if len(column_names) > 1:
            # every column is listed; slicing with [:-2] used to drop the second-to-last one
            listed_columns = '{} and {}'.format(','.join(column_names[:-1]), column_names[-1])
        else:
            listed_columns = ''.join(column_names)
        col_info = 'The number of colum(s): {}.\nThe column(s) is/are : {}'.format(len(column_names), listed_columns)
        
        #save the number of rows
        num_rows = "\nThe total number of rows: {}".format(len(self.tweets_df))
        
        #save the number of duplicate tweets
        num_dup_tweets = '\nThe number of duplicate tweets: {}'.format(len(self.tweets_df)-len(self.tweets_df.original_text.unique()))
        
        na_cols = self.tweets_df.columns[self.tweets_df.isnull().any()]
        
        #save the number of missing values
        num_na_cols = "\nThe number of columns having missing value(s): {}".format(len(na_cols))
        
        #save the columns with missing value and the num of values missing
        na_cols_num_na = ''
        
        for col in na_cols:
            na_cols_num_na += "\nThe number of rows with missing value(s) in [{}]: {}".format(col, self.tweets_df[col].isnull().sum())
        
        # save the total number of missing values
        tot_na = "\nThe total number of missing value(s): {}".format(self.tweets_df.isnull().sum().sum())
        
        print(col_info, num_rows, num_dup_tweets, num_na_cols, na_cols_num_na, tot_na)
        
        
    def slice_dataframe(self, columns=None, output=True)->pd.DataFrame:
        """
        this function will slice the tweets dataframe. it takes a list of columns to keep
        (default: created_at, original_text, polarity and subjectivity) and a boolean, output.
        If it is True the sliced dataframe is returned

        Return
        ------
        dataframe if output=True, None if output=False
        """
        if columns is None:
            # default built per call so no list object is shared between calls
            columns = ['created_at', 'original_text', 'polarity', 'subjectivity']
        # .copy() makes the slice independent of the source dataframe
        self.sliced_tweet_df = self.tweets_df[columns].copy()
        if output:
            return self.sliced_tweet_df
        return None
    
    def drop_tweet_dup(self, column_name='original_text',output=True)->pd.DataFrame:
        """
        this function will drop duplicate tweets in the sliced tweet dataframe, keeping the
        first occurrence. it takes the name of the column with the tweets in string format
        as an argument and a boolean, output. If it is True the dataframe is returned

        Return
        ------
        dataframe if output=True, None if output=False
        """
        self.sliced_tweet_df = self.sliced_tweet_df.drop_duplicates([column_name])
        
        if output:
            return self.sliced_tweet_df
        return None
        
        
    def clean_tweet(self, column_name='original_text', cleaned_tweet_column_name='cleaned_tweet', output=True)->pd.DataFrame:
        """
        this function will clean tweets in the sliced tweet dataframe: emojis, punctuation,
        newline characters and digits are removed and the text is lowercased.
        it takes the name of the column with the tweets and that of the new column for the
        cleaned tweet, both in string format, and a boolean, output.
        If it is True the dataframe with the cleaned column is returned

        Return
        ------
        dataframe if output=True, None if output=False
        """
        # characters dropped from every tweet: punctuation plus the newline character
        take_out = set(string.punctuation + '\n')
        
        def remove_punct_and_clean(tweet)->str:
            # removes emojis; NOTE(review): get_emoji_regexp is believed to be gone from emoji >= 2.0,
            # where replace_emoji is the replacement - confirm against the installed version
            if hasattr(emoji, 'replace_emoji'):
                tweet = emoji.replace_emoji(tweet, replace='')
            else:
                tweet = emoji.get_emoji_regexp().sub(r'', tweet)
            # removes punctuations and newline characters
            tweet  = "".join([char for char in tweet if char not in take_out])
            # removes digits
            tweet = re.sub('[0-9]+', '', tweet)
            # converts to lowercase
            tweet = tweet.lower()
            return tweet
        
        self._set_column(cleaned_tweet_column_name, self.sliced_tweet_df[column_name].apply(remove_punct_and_clean))
        if output:
            return self.sliced_tweet_df
        return None
        
    def convert_to_datetime(self, column_name='created_at', output=True)->pd.DataFrame:
        """
        this function will convert a column of the sliced tweet dataframe to datetime. 
        it takes the name of the column with the dates in string format as an argument and
        a boolean, output. If it is True the dataframe with the converted dates is returned

        Return
        ------
        dataframe if output=True, None if output=False
        """
        self._set_column(column_name, pd.to_datetime(self.sliced_tweet_df[column_name]))

        if output:
            return self.sliced_tweet_df
        return None
    
    def classify_polarity(self, column_name='polarity', cassified_column_name='classified_polarity', output=True)->pd.DataFrame:
        """
        this function will classify the polarity column of the sliced tweet dataframe. 
        it takes the name of the column with the polarity and that of the new column for the
        classified polarity scores, both in string format, and a boolean, output.
        If it is True a dataframe with the classified column added is returned

        Return
        ------
        dataframe if output=True, None if output=False
        """
        
        def classify(value)->str:
            """
            this function will classify a number: above 0.05 is 'positive',
            below -0.05 is 'negative', anything else is 'neutral'
            
            Return
            --------
            string
            """
            if value > 0.05:
                return 'positive'
            elif value < -0.05:
                return 'negative'
            else:
                return 'neutral'
            
        self._set_column(cassified_column_name, self.sliced_tweet_df[column_name].apply(classify))
        if output:
            return self.sliced_tweet_df
        return None
            
        

In [123]:
def get_cleaned_tweet(uncleaned_tweet_df)->pd.DataFrame:
    """
    Runs the preparation steps of TweetDfDataPreparation on a tweets dataframe:
    slice, drop duplicates, convert dates, classify polarity and clean the tweets.

    Return
    ------
    dataframe returned by the final clean_tweet step
    """
    preparation = TweetDfDataPreparation(uncleaned_tweet_df)
    intermediate_steps = (
        preparation.slice_dataframe,
        preparation.drop_tweet_dup,
        preparation.convert_to_datetime,
        preparation.classify_polarity,
    )
    # the intermediate steps are called with output=False, only the last step returns the frame
    for step in intermediate_steps:
        step(output=False)

    return preparation.clean_tweet()

def get_df_info(df)->None:
    """Prints the summary produced by TweetDfDataPreparation.print_df_info for df."""
    TweetDfDataPreparation(df).print_df_info()

In [139]:
# prints the column, row, duplicate and missing-value summary of the raw dataframe
get_df_info(tweet_df)
# sliced, de-duplicated, date-converted, polarity-classified and cleaned copy of the tweets
clened_tweet = get_cleaned_tweet(tweet_df)

The number of colum(s): 15.
The column(s) is/are : created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags and place 
The total number of rows: 6532 
The number of duplicate tweets: 4237 
The number of columns having missing value(s): 2 
The number of rows with missing value(s) in [possibly_sensitive]: 5014
The number of rows with missing value(s) in [place]: 2444 
The total number of missing value(s): 7458


In [141]:
# summary statistics of the subjectivity scores in the prepared dataframe
clened_tweet['subjectivity'].describe()

count    2295.000000
mean        0.434792
std         0.285942
min         0.000000
25%         0.194765
50%         0.480000
75%         0.700000
max         1.000000
Name: subjectivity, dtype: float64

Viewing the first 5 lines of the created dataframe

Finding the information about the dataframe

In [None]:
# FIXME(review): the module-level remove_punct_and_clean is only defined in a later cell, so this
# cell is expected to raise NameError on a fresh top-to-bottom run; the scratch column 'c' is not
# read by any other cell shown in this notebook
tweet_df['c'] = tweet_df['original_text'].apply(remove_punct_and_clean)

In [136]:
# distinct tweet texts; NOTE(review): this displays the whole array, consider showing only a few entries
tweet_df['original_text'].unique()

array(['🚨Africa is "in the midst of a full-blown third wave" of coronavirus, the head of @WHOAFRO has warned\n\nCases have risen across the continent by more than 20% and deaths have also risen by 15% in the last week\n\n@jriggers reports ~ 🧵\nhttps://t.co/CRDhqPHFWM',
       'Dr Moeti is head of WHO in Africa, and one of the best public health experts and leaders I know. Hers is a desperate request for vaccines to Africa. We plead with Germany and the UK to lift patent restrictions and urgently transfer technology to enable production in Africa. https://t.co/sOgIroihOc',
       "Thank you @research2note for creating this amazing campaign &amp; turning social media #red4research today. @NHSRDFORUM is all about sharing the talent, passion  &amp; commitment of individuals coming together as a community for the benefit of all. You've done this. Well done 👋",
       ...,
       'I urge all the people of #India to take #Covid19 seriously and take your #vaccines as it’s the only way to beat 

Displaying the information about null values

From the information displayed about the data, there are 6532 entries in all, and 2 of the 15 columns have missing values. The number of missing values in these columns, compared to the total number of entries, suggests that they will not have a significant number of entries for analysis. Therefore the 2 columns were dropped.

The chosen columns were created_at, original_text, hashtags and user_mentions. The original_text was cleaned to create the cleaned_text.

### Data Selection

In [None]:
# selection of data for analysis
# .copy() makes selected_df an independent dataframe, so the column assignments in the
# following cells do not write into a slice of tweet_df (source of SettingWithCopyWarning)
selected_df = tweet_df[['created_at', 'original_text', 'hashtags', 'user_mentions','place']].copy()

### Data preparation

In [None]:
# reports how many distinct tweet texts exist compared to the number of loaded entries
message = 'There are {} unique entries of "tweets" (original_text column) out of {} entries'
print(message.format(selected_df['original_text'].nunique(dropna=False), tweet_len))

The information derived from the unique entries of tweets indicates that there are duplicate values. 
These duplicate values were dropped.

In [None]:
# removes duplicates of tweets, keeping the first occurrence of each original_text.
# drop_duplicates returns a new dataframe, so nothing is assigned into (or dropped in place from)
# a slice of tweet_df
selected_df = selected_df.drop_duplicates(subset=['original_text'])

# Drops rows with a missing value in any of the selected columns
selected_df = selected_df.dropna()

### Text cleaning

In [None]:
# creates a string of characters to be taken out: standard punctuation plus every entry of unwanted
unwanted = ["'\n'","’"]
# all entries are appended at once; the previous loop overwrote take_out on every pass,
# so only the last entry was kept and the newline was never included
take_out = string.punctuation + ''.join(unwanted)

In [None]:
# function to clean tweet 

def remove_punct_and_clean(tweet):
    """
    Cleans one tweet: removes emojis, punctuation, the characters listed in the
    module-level `unwanted` list and digits, then lowercases the text.

    Args:
    -----
    tweet: str - text of a tweet

    Returns
    -------
    cleaned tweet as a string
    """
    # built from every entry of unwanted; the previous loop overwrote the value on each pass
    # and kept only the last entry
    chars_to_drop = set(string.punctuation + ''.join(unwanted))
    # NOTE(review): get_emoji_regexp is believed to be gone from emoji >= 2.0, where
    # replace_emoji is the replacement - confirm against the installed version
    if hasattr(emoji, 'replace_emoji'):
        tweet = emoji.replace_emoji(tweet, replace='')
    else:
        tweet = emoji.get_emoji_regexp().sub(r'', tweet)
    tweet  = "".join([char for char in tweet if char not in chars_to_drop])
    tweet = re.sub('[0-9]+', '', tweet)
    tweet = tweet.lower()
    
    return tweet

In [None]:
# adds the cleaned version of every tweet to selected_df as the cleaned_text column
selected_df['cleaned_text'] = selected_df['original_text'].map(remove_punct_and_clean)

In [None]:
# converts every entry to string, then to lowercase
selected_df['cleaned_text'] = selected_df['cleaned_text'].astype(str).str.lower()

In [None]:
# Displays a figure of the most used words
all_cleaned_text = ' '.join(selected_df.cleaned_text.values)
word_cloud = WordCloud(width=1000,height=600,stopwords=STOPWORDS).generate(all_cleaned_text)

fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(word_cloud)
ax.axis('off')
ax.set_title('Most Frequent Words In Our Tweets', fontsize=16)
plt.show()

In [None]:
# the key is the string 'hi' (read back as gm['hi'] in the next cell); the bare name hi is undefined
gm = {'hi': 9}

In [None]:
# displays the value stored in gm under the string key 'hi'
gm['hi']