In [58]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud
import time


In [59]:
#display settings
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [60]:
#reading data and viewing first 5 rows
naira_value = pd.read_csv ('data/naira_value.csv')
naira_value.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2014-11-30 20:16:29+00:00,5.39151e+17,"@bhurharhi higher, because naira has low value...",411sevices
1,1,2014-11-30 20:13:52+00:00,5.391503e+17,AntiGayLaw - Gone\nNaira Value - what value?\n...,thetitoway
2,2,2014-11-30 20:08:51+00:00,5.391491e+17,@remiopakunle maybe not. Thinking of head lamp...,doysol_
3,3,2014-11-30 16:14:40+00:00,5.390901e+17,I just lost like $70 because Naira value went ...,TomiwaBrown
4,4,2014-11-30 15:42:59+00:00,5.390822e+17,In 30yrs the value of naira was murdered! E,DaddyChelsea


In [61]:
#reading data and viewing first 5 rows
naira_dollar = pd.read_csv ('data/naira_dollar.csv')
naira_dollar.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2014-11-30 23:51:38+00:00,5.392051e+17,Naira devaluation will have Mixed effect on ec...,naijanewso
1,1,2014-11-30 23:38:47+00:00,5.392019e+17,"Nigeria: Our oyel money, dollar, naira, and fo...",Village_Square
2,2,2014-11-30 23:21:23+00:00,5.391975e+17,"Nigeria: Our oyel money, dollar, naira, and fo...",EagleReporters
3,3,2014-11-30 23:21:18+00:00,5.391975e+17,"Nigeria: Our oyel money, dollar, naira, and fo...",EagleReporters
4,4,2014-11-30 23:10:02+00:00,5.391947e+17,#Ginjaland | News: Falling oil price: CBN cuts...,ginjaland


In [62]:
#reading data and viewing first 5 rows
supplemental_tweets = pd.read_csv ('data/new_tweets.csv')
supplemental_tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2020-01-30 23:08:43+00:00,1.22302e+18,In the 70's the Naira had more value than the ...,VintageMAk
1,1,2020-01-30 22:18:17+00:00,1.223008e+18,@MaiSharifai @paki_yusuf @AysherSadiq @_yakawa...,yakubu_mj
2,2,2020-01-30 21:50:55+00:00,1.223001e+18,@VanGelder_ Liberia dollar or naira,Selenu___
3,3,2020-01-30 21:47:56+00:00,1.223e+18,@MizzAina If I had a dollar for every time you...,TheToluAdeyi
4,4,2020-01-30 21:45:28+00:00,1.222999e+18,Good and not so good.\nMost people do not know...,AauraFragrance


# Data Cleaning

In [63]:
#dropping "unnamed" columns from dataframes
naira_value.drop(columns = 'Unnamed: 0', inplace = True)
naira_dollar.drop(columns = 'Unnamed: 0', inplace = True)
supplemental_tweets.drop(columns = 'Unnamed: 0', inplace = True)

### Concatenating "naira dollar" and "naira value" tweets 

In [64]:
print(naira_dollar.shape)
print(naira_value.shape)
print(supplemental_tweets.shape)

(68713, 4)
(45421, 4)
(8629, 4)


In [65]:
naira_tweets = pd.concat([naira_dollar, naira_value, supplemental_tweets])
naira_tweets.sort_index(inplace = True)
naira_tweets.shape

(122763, 4)

In [66]:
#checking count of unique values
naira_tweets.nunique()

Datetime    115598
Tweet Id    119957
Text        116330
Username     51226
dtype: int64

In [67]:
#dropping duplicates
naira_tweets.drop_duplicates(subset=['Text'],keep='last',inplace=True)

For the purpose of this analysis, we will be ignoring the fact that multiple tweets may have come from one user. This is however relevant information to keep in mind when interpreting results from the analysis. We will however drop duplicate tweets.

In [68]:
naira_tweets.shape

(116331, 4)

In [69]:
#checking info about dataframe
naira_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116331 entries, 0 to 68712
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Datetime  116331 non-null  object 
 1   Tweet Id  116330 non-null  float64
 2   Text      116330 non-null  object 
 3   Username  116312 non-null  object 
dtypes: float64(1), object(3)
memory usage: 4.4+ MB


In [70]:
#removing time component from datetime object
naira_tweets['Datetime']= naira_tweets['Datetime'].apply(lambda x:str(x)[:10])

In [71]:
#cleaning "datetime" column
naira_tweets.drop(naira_tweets[naira_tweets['Datetime']== 'mmannymash'].index, inplace= True)

In [72]:
#function to find rows with invalid dates 
def find_invalid_dates (string):
    if string[:4] not in str([2014, 2015, 2016, 2017, 2018, 2019, 2020]):
        return "yes"
    else:
        return "no"
    

In [73]:
#mapping function to dataframe
naira_tweets['invalid_date'] = naira_tweets.apply(lambda x:find_invalid_dates(x['Datetime']),axis =1)

In [74]:
#rows with "invalid dates"
naira_tweets[naira_tweets['invalid_date']=='yes']

Unnamed: 0,Datetime,Tweet Id,Text,Username,invalid_date
34938,ElegbaaFra,,,,yes


In [75]:
#dropping rows with invalid dates
naira_tweets.drop(naira_tweets[naira_tweets['invalid_date']=='yes'].index, inplace = True)

In [76]:
#converting column to datetime format
naira_tweets['Datetime']  = pd.to_datetime(naira_tweets['Datetime'])

In [77]:
#confirming datatypes
naira_tweets.dtypes

Datetime        datetime64[ns]
Tweet Id               float64
Text                    object
Username                object
invalid_date            object
dtype: object

In [78]:
#checking for nulls
naira_tweets.isnull().sum()

Datetime         0
Tweet Id         0
Text             0
Username        18
invalid_date     0
dtype: int64

There are nulls in the user name column, however we are mainly concerned with the text column.

In [79]:
#renaming columns
naira_tweets.rename(columns = {'Datetime': 'date',
                           'Text': 'text',
                           'Username': 'user_name',
                            'Tweet Id': 'tweet_id'
                           }, inplace = True)
naira_tweets.head()

Unnamed: 0,date,tweet_id,text,user_name,invalid_date
0,2014-11-30,5.392051e+17,Naira devaluation will have Mixed effect on ec...,naijanewso,no
0,2014-11-30,5.39151e+17,"@bhurharhi higher, because naira has low value...",411sevices,no
1,2014-11-30,5.392019e+17,"Nigeria: Our oyel money, dollar, naira, and fo...",Village_Square,no
1,2014-11-30,5.391503e+17,AntiGayLaw - Gone\nNaira Value - what value?\n...,thetitoway,no
2,2014-11-30,5.391975e+17,"Nigeria: Our oyel money, dollar, naira, and fo...",EagleReporters,no


In [80]:
#removing links from texts
naira_tweets['text'] = naira_tweets['text'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True)

In [81]:
#setting date as index
naira_tweets.set_index('date', inplace = True)

In [82]:
#sorting dataframe based on index
naira_tweets.sort_index(inplace = True)

## Exchange Rate Data

Nigeria has 3 different exchange rates depending on source and purpose. For this analysis, we would be focusing on daily frequency Bureau de Change (BDC) Naira to Dollar Exchange rate as this rate is more reflective of market conditions.

In [83]:
#reading data
bdc_rates = pd.read_csv('data/bdc_exchange_rate.csv')
bdc_rates.head()

Unnamed: 0.1,Unnamed: 0,nominal,BDC
0,2014-01-02,157.26,172.0
1,2014-01-03,157.26,172.0
2,2014-01-06,157.28,172.0
3,2014-01-07,157.28,172.0
4,2014-01-08,157.28,172.0


In [84]:
#renaming columns
bdc_rates.rename(columns = {'Unnamed: 0': 'date', 'BDC': 'bdc'}, inplace = True)
bdc_rates.head()

Unnamed: 0,date,nominal,bdc
0,2014-01-02,157.26,172.0
1,2014-01-03,157.26,172.0
2,2014-01-06,157.28,172.0
3,2014-01-07,157.28,172.0
4,2014-01-08,157.28,172.0


In [85]:
#changing index to datetime format
bdc_rates.date = pd.to_datetime(bdc_rates.date)
bdc_rates.set_index('date', inplace = True)

### Saving cleaned data as csv files

In [86]:
naira_tweets.to_csv('data/naira_tweets_cleaned.csv')
bdc_rates.to_csv('data/bdc_rates_cleaned.csv')