## IMPORT MODULES

In [22]:
import pandas as pd
import sys
import os
import re

sys.path.append(os.path.abspath(os.path.join('..')))
from extract_dataframe import TweetDfExtractor
from extract_dataframe import read_json
from clean_tweets_dataframe import Clean_Tweets
_, global_tweet_list = read_json("../data/global_twitter_data.json")

## INITIALIZE TweetDfExtractor INSTANCE AND GET DATA

In [23]:
global_tweet_df_extractor = TweetDfExtractor(global_tweet_list)
global_tweets_df = global_tweet_df_extractor.get_tweet_df()

## DATA PREPARATION

#### CHECK IF ALL DESIRED COLUMNS ARE EXTRACTED 

In [24]:
global_tweets_df.head()

Unnamed: 0,created_at,source,full_text,polarity,subjectivity,sentiment,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy Extra random image I Lets focus ...,-0.125,0.190625,negative,en,15760,2,i_ameztoy,20497,2621,,City++++,i_ameztoy++++,
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info #Chinas media explains the mi...,-0.1,0.1,negative,en,6967,201,ZIisq,65,272,,China++++Taiwan++++,IndoPac_Info++++,
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",China even cut off communication they dont anw...,0.0,0.0,neutral,en,2166,0,Fin21Free,85,392,,XiJinping++++,ZelenskyyUa++++,Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",Putin to #XiJinping I told you my friend Taiwa...,0.1,0.35,positive,en,2166,0,Fin21Free,85,392,,XiJinping++++,,Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...",RT @ChinaUncensored I’m sorry I thought Taiwan...,-6.938894e-18,0.55625,negative,en,17247,381,VizziniDolores,910,2608,,,ChinaUncensored++++,"Ayent, Schweiz"


In [25]:
global_tweets_df.shape

(22000, 16)

#### INITIALIZE Clean_Tweets INSTANCE

In [26]:
global_clean_tweets = Clean_Tweets(global_tweets_df)

Clean_Tweets INSTANCE CREATED


#### DROP DUPLICATES 
##### BY USING PANDAS df.drop_duplicates() which is called by our global_clean_tweets.drop_duplicate(global_tweets_df) function

In [27]:
global_tweets_df = global_clean_tweets.drop_duplicate(global_tweets_df)
global_tweets_df.shape
# We can see that on the below output we have droped 3 duplicate rows.

(21997, 16)

#### REMOVE NON ENGLISH

In [28]:
global_tweets_df = global_clean_tweets.remove_non_english_tweets(global_tweets_df)
global_tweets_df.shape
# We can see no rows were dropped because all the languages represent 'en'

(21997, 16)

#### CHECK FOR NULL/MISSING VALUES

In [29]:
global_tweets_df.isna().sum()
# We can see that we have 15807 missing values under the column 'possibly_sensitive'

created_at                0
source                    0
full_text                 0
polarity                  0
subjectivity              0
sentiment                 0
lang                      0
favorite_count            0
retweet_count             0
original_author           0
followers_count           0
friends_count             0
possibly_sensitive    15807
hashtags                  0
user_mentions             0
place                     0
dtype: int64

##### HANDLE MISSING DATA

In [30]:
global_tweets_df['possibly_sensitive'] = global_tweets_df['possibly_sensitive'].fillna("not_found")
global_tweets_df.isna().sum()
# The missing rows under the column 'possibly_sensitive' have been repaced with "not_found"

created_at            0
source                0
full_text             0
polarity              0
subjectivity          0
sentiment             0
lang                  0
favorite_count        0
retweet_count         0
original_author       0
followers_count       0
friends_count         0
possibly_sensitive    0
hashtags              0
user_mentions         0
place                 0
dtype: int64

In [31]:
global_tweets_df[global_tweets_df['place'] == ''].shape
# Eventhough on the above output there were 0 missing data under the column 'place' those missing data's were 
# an empty string rather than a null value, so we have to replace those value with "not_known"

(9890, 16)

In [32]:
global_tweets_df['place'] = global_tweets_df['place'].fillna("not_known")
global_tweets_df.isna().sum()


created_at            0
source                0
full_text             0
polarity              0
subjectivity          0
sentiment             0
lang                  0
favorite_count        0
retweet_count         0
original_author       0
followers_count       0
friends_count         0
possibly_sensitive    0
hashtags              0
user_mentions         0
place                 0
dtype: int64

#### REMOVE UNWANTED CHARACTERS

In [33]:
global_tweets_df['place'][124:126]

124    Nunya*
125          
Name: place, dtype: object

In [34]:
global_tweets_df = global_clean_tweets.remove_place_characters(global_tweets_df)
global_tweets_df.isna().sum()

created_at            0
source                0
full_text             0
polarity              0
subjectivity          0
sentiment             0
lang                  0
favorite_count        0
retweet_count         0
original_author       0
followers_count       0
friends_count         0
possibly_sensitive    0
hashtags              0
user_mentions         0
place                 0
dtype: int64

In [35]:
global_tweets_df['place'][124:126]
# We can see that invalid characters under the column 'place' are replaced and cleaned 

124    Nunya
125         
Name: place, dtype: object

#### CONVERT STRING TO DATETIME

In [36]:
global_tweets_df.info()
# We can see that below the column 'created_at' has the type of object/string, we will change this to
# datetime using pd.to_datetime(df) which is called by global_clean_tweets.convert_to_datetime(global_tweets_df) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21997 entries, 0 to 21999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   created_at          21997 non-null  object 
 1   source              21997 non-null  object 
 2   full_text           21997 non-null  object 
 3   polarity            21997 non-null  float64
 4   subjectivity        21997 non-null  float64
 5   sentiment           21997 non-null  object 
 6   lang                21997 non-null  object 
 7   favorite_count      21997 non-null  int64  
 8   retweet_count       21997 non-null  int64  
 9   original_author     21997 non-null  object 
 10  followers_count     21997 non-null  int64  
 11  friends_count       21997 non-null  int64  
 12  possibly_sensitive  21997 non-null  object 
 13  hashtags            21997 non-null  object 
 14  user_mentions       21997 non-null  object 
 15  place               21997 non-null  object 
dtypes: f

In [37]:
global_tweets_df = global_clean_tweets.convert_to_datetime(global_tweets_df)
global_tweets_df.info()
# Now we can see that it is converted to datetime64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21997 entries, 0 to 21999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          21997 non-null  datetime64[ns, UTC]
 1   source              21997 non-null  object             
 2   full_text           21997 non-null  object             
 3   polarity            21997 non-null  float64            
 4   subjectivity        21997 non-null  float64            
 5   sentiment           21997 non-null  object             
 6   lang                21997 non-null  object             
 7   favorite_count      21997 non-null  int64              
 8   retweet_count       21997 non-null  int64              
 9   original_author     21997 non-null  object             
 10  followers_count     21997 non-null  int64              
 11  friends_count       21997 non-null  int64              
 12  possibly_sensitive  21997 non-nu