In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from textblob import TextBlob

In [2]:
# Load the processed tweet data into the working DataFrame
df = pd.read_csv(filepath_or_buffer='processed_tweets.csv')

In [3]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type,clean_tweets,lemmatized_tweets
0,"In other words #katandandre, your food was cra...",not_cyberbullying,words katandandre food crapilicious,word katandandre food crapilicious
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white theblock imacelebrityau today s...,aussietv white theblock imacelebrityau today s...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,classy whore red velvet cupcakes,classy whore red velvet cupcake
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,meh p thanks heads concerned another angry dud...,meh p thanks head concerned another angry dude...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,isis account pretending kurdish account like i...,isi account pretending kurdish account like is...


In [4]:
# Drop every row that has a missing value in any column (mutates df in place)
df.dropna(axis=0, how='any', inplace=True)

In [5]:
# Per-column count of missing values still present in df
df.isnull().sum()

tweet_text            0
cyberbullying_type    0
clean_tweets          0
lemmatized_tweets     0
dtype: int64

In [6]:
# NLTK Sentiment Analysis (VADER)

In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance, created once and reused for every tweet when the
# `polarity_nltk` column is computed.
analyzer = SentimentIntensityAnalyzer()

In [8]:
def _vader_compound(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']


# Make sure every entry is a string, then score each lemmatized tweet.
lemmas = df['lemmatized_tweets'].astype(str)
df['lemmatized_tweets'] = lemmas
df['polarity_nltk'] = lemmas.apply(_vader_compound)

In [9]:
# Preview the first five rows, now including the polarity_nltk column
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type,clean_tweets,lemmatized_tweets,polarity_nltk
0,"In other words #katandandre, your food was cra...",not_cyberbullying,words katandandre food crapilicious,word katandandre food crapilicious,0.0
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white theblock imacelebrityau today s...,aussietv white theblock imacelebrityau today s...,0.0
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,classy whore red velvet cupcakes,classy whore red velvet cupcake,-0.34
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,meh p thanks heads concerned another angry dud...,meh p thanks head concerned another angry dude...,-0.1779
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,isis account pretending kurdish account like i...,isi account pretending kurdish account like is...,0.4404


In [10]:
# TextBlob Sentiment Analysis

In [11]:
def getPolarity(text):
    """Return TextBlob's polarity score for ``text``.

    Parameters
    ----------
    text : str
        Text to score (here, one lemmatized tweet).

    Returns
    -------
    float
        The value of ``TextBlob(text).sentiment.polarity``.
    """
    return TextBlob(text).sentiment.polarity


# Add a single new column holding the TextBlob polarity of each lemmatized tweet.
df['polarity_textblob'] = df['lemmatized_tweets'].apply(getPolarity)

In [12]:
# Preview the first five rows, now including the polarity_textblob column
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type,clean_tweets,lemmatized_tweets,polarity_nltk,polarity_textblob
0,"In other words #katandandre, your food was cra...",not_cyberbullying,words katandandre food crapilicious,word katandandre food crapilicious,0.0,0.0
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white theblock imacelebrityau today s...,aussietv white theblock imacelebrityau today s...,0.0,0.0
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,classy whore red velvet cupcakes,classy whore red velvet cupcake,-0.34,0.05
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,meh p thanks heads concerned another angry dud...,meh p thanks head concerned another angry dude...,-0.1779,-0.15
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,isis account pretending kurdish account like i...,isi account pretending kurdish account like is...,0.4404,0.0


In [13]:
# Individual Category analysis

In [14]:
# Cyberbullying Polarity Histograms
def polarity_hist(bully_type, out_dir='sentiment_plots'):
    """Save one histogram per polarity column for a single cyberbullying category.

    Filters the notebook-level ``df`` to the rows whose ``cyberbullying_type``
    equals ``bully_type``, prints the shape of that subset, and writes
    ``<out_dir>/<bully_type>_<polarity column>.png`` for the ``polarity_nltk``
    and ``polarity_textblob`` columns.

    Parameters
    ----------
    bully_type : str
        Value of ``cyberbullying_type`` to select.
    out_dir : str or pathlib.Path, default 'sentiment_plots'
        Directory that receives the PNG files; created if it does not exist.

    Returns
    -------
    None
    """
    bully_data = df.loc[df.cyberbullying_type == bully_type]
    print(bully_data.shape)

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for polarity in ['polarity_nltk', 'polarity_textblob']:
        # Use a dedicated figure per plot and close it after saving, so no
        # empty figure is left behind for the notebook to render.
        fig, ax = plt.subplots()
        sns.histplot(data=bully_data, x=polarity, ax=ax)
        ax.set_title('{} Polarity Distribution {}'.format(bully_type, polarity))
        fig.savefig(out_dir / '{}_{}.png'.format(bully_type, polarity))
        plt.close(fig)

In [15]:
# Save the polarity histograms for every category present in the data.
for bully in df.cyberbullying_type.unique():
    polarity_hist(bully)

(7846, 6)
(7949, 6)
(7998, 6)
(7632, 6)
(7992, 6)
(7960, 6)


<Figure size 432x288 with 0 Axes>

In [16]:
# Persist the frame with both polarity columns, without the index column
df.to_csv(path_or_buf='model_data.csv', index=False)