# Sentiment Analysis of Twitter

In [1]:
# libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import json
import os

import re
import spacy 
from spacy import displacy
import gensim
nlp = spacy.load("en_core_web_sm")

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder,StandardScaler, RobustScaler

pd.set_option("display.max_columns",None)
pd.set_option("display.width",500)
sns.set(rc={"figure.figsize":(12,12)})

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

[nltk_data] Downloading package stopwords to C:\Users\Melis
[nltk_data]     Nur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Melis
[nltk_data]     Nur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Define the directory where the JSON files are located
json_dir = 'data/'

# Create an empty list to store the dataframes
df_list = []

# Loop through all files in the directory
for filename in os.listdir(json_dir):
    # Check if the file is a JSON file
    if filename.endswith('.json'):
        # Load the JSON data
        with open(os.path.join(json_dir, filename), 'r', encoding='ISO-8859-1') as f:
            data = json.load(f)
        # Convert the dictionary to a DataFrame and append it to the list
        df_list.append(pd.DataFrame.from_dict(data))

# Concatenate all dataframes in the list into one dataframe
data = pd.concat(df_list, ignore_index=True)

In [3]:
data = data[["text", "id","created_at"]]
data = data.iloc[:8000]
data.head()

Unnamed: 0,text,id,created_at
0,"@peanut_pitbull @ChrisJo80574828 @TheRealEarns @MarinaMedvin The government has no money they spend more than they receive via taxes. So they borrow from the fed which worsens inflation,worsens wealth inequality, and inflates bubbles. What good are roads etc if you are priced out of purchasing goods, unable to pay rent, and no savings",1457055231225778177,2021-11-06T18:40:12.000Z
1,"@LitUpMagazine1 @thomaschattwill @nhannahjones Well, there is an argument to be made for people coming from different backgrounds and how that affects them. Which also mostly explain the inequality better than race: poor people tend to do worse than middle class/rich people and some ethnicities tends to be poorer than others.",1457055229468348420,2021-11-06T18:40:12.000Z
2,There's nothing libs love more than talking about institutional problems. Through parliaments one can set up all sort of investigative committees and set proposals. It's a great way to not actually address the economic inequality faced by the black community.,1457055203388280833,2021-11-06T18:40:05.000Z
3,Inequality has effects everywhere https://t.co/MK6E2AiVzq,1457055130914861057,2021-11-06T18:39:48.000Z
4,"/6-Now, 'equity'-based education being handled by Equity Alliance of Minnesota. I don't know anything about them. I haven't heard why the change. Anyone?\nMy point: Focus changed from ""equity of outcomes"" to guaranteed inequity, inequality, with no notice, explanation. More ?s",1457054652197920770,2021-11-06T18:37:54.000Z


In [4]:
len(data)

8000

## Cleaning Data

In [5]:
def preprocess_text(text):
    # removing URLs
    url_regex = re.compile(r'http\S+')
    text = url_regex.sub('', text)
    
    # expanding contractions
    text = contractions.fix(text)
    
    # removing special characters and digits
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # converting to lowercase
    text = text.lower()
    
    # tokenizing text
    tokens = word_tokenize(text)
    
    # removing stopwords
    stop_words = stopwords.words('english')
    tokens = [word for word in tokens if word not in stop_words]
    
    # lemmatizing tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # joining tokens back into a single string
    text = ' '.join(tokens)
    
    return text

In [6]:
# Apply preprocessing function to 'text' column of DataFrame
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Tokenize the text in the 'text' column of your dataframe
texts = [[token.text for token in nlp(text)] for text in data['cleaned_text']]

# Create a dictionary of unique words from the tokenized texts
dictionary = Dictionary(texts)

# Convert the tokenized texts to bag-of-words format using the dictionary
corpus = [dictionary.doc2bow(text) for text in texts]

# Train an LDA model on the corpus
lda_model = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)

In [7]:
def get_dominant_topic(model, corpus, texts):
    # Get main topic in each document
    topic_keywords = model.show_topics(formatted=False)
    dominant_topic = []
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: x[1], reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # get dominant topic
                topic = topic_keywords[topic_num]
                words = [word for word, prop in topic[1]]
                topic_name = ', '.join(words)
                dominant_topic.append(topic_name)
            else:
                break
    return dominant_topic

In [8]:
data['topic'] = get_dominant_topic(lda_model, corpus, data['cleaned_text'])

In [9]:
data.head()

Unnamed: 0,text,id,created_at,cleaned_text,topic
0,"@peanut_pitbull @ChrisJo80574828 @TheRealEarns @MarinaMedvin The government has no money they spend more than they receive via taxes. So they borrow from the fed which worsens inflation,worsens wealth inequality, and inflates bubbles. What good are roads etc if you are priced out of purchasing goods, unable to pay rent, and no savings",1457055231225778177,2021-11-06T18:40:12.000Z,peanut pitbull chrisjo therealearns marinamedvin government money spend receive via tax borrow fed worsens inflation worsens wealth inequality inflates bubble good road etc priced purchasing good unable pay rent saving,"inequality, climate, people, racial, must, wealth, amp, history, poverty, change"
1,"@LitUpMagazine1 @thomaschattwill @nhannahjones Well, there is an argument to be made for people coming from different backgrounds and how that affects them. Which also mostly explain the inequality better than race: poor people tend to do worse than middle class/rich people and some ethnicities tends to be poorer than others.",1457055229468348420,2021-11-06T18:40:12.000Z,litupmagazine thomaschattwill nhannahjones well argument made people coming different background affect also mostly explain inequality better race poor people tend worse middle class rich people ethnicity tends poorer others,"inequality, people, amp, health, u, one, income, economic, new, need"
2,There's nothing libs love more than talking about institutional problems. Through parliaments one can set up all sort of investigative committees and set proposals. It's a great way to not actually address the economic inequality faced by the black community.,1457055203388280833,2021-11-06T18:40:05.000Z,nothing libs love talking institutional problem parliament one set sort investigative committee set proposal great way actually address economic inequality faced black community,"inequality, people, amp, health, u, one, income, economic, new, need"
3,Inequality has effects everywhere https://t.co/MK6E2AiVzq,1457055130914861057,2021-11-06T18:39:48.000Z,inequality effect everywhere,"inequality, climate, people, racial, must, wealth, amp, history, poverty, change"
4,"/6-Now, 'equity'-based education being handled by Equity Alliance of Minnesota. I don't know anything about them. I haven't heard why the change. Anyone?\nMy point: Focus changed from ""equity of outcomes"" to guaranteed inequity, inequality, with no notice, explanation. More ?s",1457054652197920770,2021-11-06T18:37:54.000Z,equity based education handled equity alliance minnesota know anything heard change anyone point focus changed equity outcome guaranteed inequity inequality notice explanation,"inequality, wealth, amp, america, control, social, via, spiraled, problem, gender"


In [10]:
def getSubjectivity(review):
    return TextBlob(review).sentiment.subjectivity

In [11]:
def getPolarity(review):
    return TextBlob(review).sentiment.polarity

In [12]:
# function to analyze the reviews
def analysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [13]:
data['Subjectivity'] =data['cleaned_text'].apply(getSubjectivity) 
data['Polarity'] = data['cleaned_text'].apply(getPolarity) 
data['Analysis'] = data['Polarity'].apply(analysis)

In [14]:
data.head()

Unnamed: 0,text,id,created_at,cleaned_text,topic,Subjectivity,Polarity,Analysis
0,"@peanut_pitbull @ChrisJo80574828 @TheRealEarns @MarinaMedvin The government has no money they spend more than they receive via taxes. So they borrow from the fed which worsens inflation,worsens wealth inequality, and inflates bubbles. What good are roads etc if you are priced out of purchasing goods, unable to pay rent, and no savings",1457055231225778177,2021-11-06T18:40:12.000Z,peanut pitbull chrisjo therealearns marinamedvin government money spend receive via tax borrow fed worsens inflation worsens wealth inequality inflates bubble good road etc priced purchasing good unable pay rent saving,"inequality, climate, people, racial, must, wealth, amp, history, poverty, change",0.566667,0.3,Positive
1,"@LitUpMagazine1 @thomaschattwill @nhannahjones Well, there is an argument to be made for people coming from different backgrounds and how that affects them. Which also mostly explain the inequality better than race: poor people tend to do worse than middle class/rich people and some ethnicities tends to be poorer than others.",1457055229468348420,2021-11-06T18:40:12.000Z,litupmagazine thomaschattwill nhannahjones well argument made people coming different background affect also mostly explain inequality better race poor people tend worse middle class rich people ethnicity tends poorer others,"inequality, people, amp, health, u, one, income, economic, new, need",0.507143,0.082143,Positive
2,There's nothing libs love more than talking about institutional problems. Through parliaments one can set up all sort of investigative committees and set proposals. It's a great way to not actually address the economic inequality faced by the black community.,1457055203388280833,2021-11-06T18:40:05.000Z,nothing libs love talking institutional problem parliament one set sort investigative committee set proposal great way actually address economic inequality faced black community,"inequality, people, amp, health, u, one, income, economic, new, need",0.416667,0.266667,Positive
3,Inequality has effects everywhere https://t.co/MK6E2AiVzq,1457055130914861057,2021-11-06T18:39:48.000Z,inequality effect everywhere,"inequality, climate, people, racial, must, wealth, amp, history, poverty, change",0.0,0.0,Neutral
4,"/6-Now, 'equity'-based education being handled by Equity Alliance of Minnesota. I don't know anything about them. I haven't heard why the change. Anyone?\nMy point: Focus changed from ""equity of outcomes"" to guaranteed inequity, inequality, with no notice, explanation. More ?s",1457054652197920770,2021-11-06T18:37:54.000Z,equity based education handled equity alliance minnesota know anything heard change anyone point focus changed equity outcome guaranteed inequity inequality notice explanation,"inequality, wealth, amp, america, control, social, via, spiraled, problem, gender",0.0,0.0,Neutral


In [15]:
#data= data[:8000]
#data.to_csv('100_tweet_data.csv', index=False)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text          8000 non-null   object 
 1   id            8000 non-null   object 
 2   created_at    8000 non-null   object 
 3   cleaned_text  8000 non-null   object 
 4   topic         8000 non-null   object 
 5   Subjectivity  8000 non-null   float64
 6   Polarity      8000 non-null   float64
 7   Analysis      8000 non-null   object 
dtypes: float64(2), object(6)
memory usage: 500.1+ KB


## Exploratory Data Analysis

In [17]:
temp = data.groupby('Analysis').count()['cleaned_text'].reset_index().sort_values(by='cleaned_text', ascending=False)
temp.style.background_gradient(cmap='Purples')

Unnamed: 0,Analysis,cleaned_text
2,Positive,3996
1,Neutral,2030
0,Negative,1974


In [19]:
import datetime

# Convert the dates to datetime objects
parsed_dates = [datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") for date in data["created_at"]]

# Find the first and last dates
first_date = min(parsed_dates)
last_date = max(parsed_dates)

# Print the first and last dates
print("First date:", first_date)
print("Last date:", last_date)

First date: 2021-11-06 14:52:20
Last date: 2021-11-09 14:57:00


In [22]:
data["created_at"].unique()

array(['2021-11-06T18:40:12.000Z', '2021-11-06T18:40:05.000Z',
       '2021-11-06T18:39:48.000Z', ..., '2021-11-09T14:00:03.000Z',
       '2021-11-09T14:00:02.000Z', '2021-11-09T14:00:00.000Z'],
      dtype=object)

In [23]:
data["created_at"].nunique()

7826

In [57]:
wikipedia_events = pd.read_csv("wikipedia_events.csv")
wikipedia_events.head()

Unnamed: 0,Date,Title,Event
0,"July 1, 2021",Armed conflicts and attacks,Allied Democratic Forces insurgency
1,"July 1, 2021",Armed conflicts and attacks,Ugandan
2,"July 1, 2021",Armed conflicts and attacks,Ambassador
3,"July 1, 2021",Arts and culture,2021 Canadian Indian residential schools gravesite discoveries
4,"July 1, 2021",Arts and culture,Indigenous


In [59]:
wikipedia_events.tail()

Unnamed: 0,Date,Title,Event
6744,"November 30, 2021",Politics and elections,2021 Honduran general election
6745,"November 30, 2021",Politics and elections,centre
6746,"November 30, 2021",Sports,Impact of the COVID-19 pandemic on sports
6747,"November 30, 2021",Sports,2021–22 NBA season
6748,"November 30, 2021",Sports,Los Angeles Lakers


In [60]:
wikipedia_events.drop_duplicates(keep=False, inplace=True)

In [62]:
len(wikipedia_events)

6711