In [34]:

import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
import re
from nltk.util import bigrams, trigrams
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [35]:
# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: 'punkt' for word_tokenize, 'stopwords' for the stop-word list,
# and 'wordnet' for WordNetLemmatizer (without it the first lemmatize() call
# raises LookupError).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jiyon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jiyon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Load the three tweet tables from paths relative to the notebook's working
# directory. Only `following` is used by the cells visible below.
followers = pd.read_csv("../Dataset/tweets/followers_data.csv")
following = pd.read_csv("../Dataset/tweets/followings_data.csv")
user = pd.read_csv("../Dataset/tweets/user_data.csv")

In [37]:
# Preview the raw followings table (pandas elides the middle rows when rendering).
following

Unnamed: 0,user,tweets,labels,follows
0,user1,"b""@theresa_may U are a complete bitch an a sel...",1,following1
1,user1,"b'RT @NadineDorries: So, it\xe2\x80\x99s true....",4,following1
2,user1,"b""@theresa_may @10DowningStreet Why do u never...",4,following1
3,user1,"b""@Anna_Soubry @theresa_may @sarahwollaston @B...",1,following1
4,user1,b'RT @RealitySmash: The left want this video o...,0,following1
...,...,...,...,...
4259,user52,And she's pushing for Sharia Law in America. ...,0,following65
4260,user52,RT @RGodGivenRights: @Lrihendry @PinkBelgium @...,0,following65
4261,user52,RT @monk_asian: #Sharia's procedure of #hand a...,0,following65
4262,user52,RT @LanceSilver1: #Sharia Law VIOLATES Our #Bi...,0,following65


In [38]:
# Built once at cell level so every clean_text() call reuses them.
stopwords_set = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one raw tweet into a space-separated string of lemmatised tokens.

    Some tweets in the data are the ``str()`` of a bytes object (``b'...'``
    with literal ``\\xNN`` / ``\\n`` escape sequences), so those artefacts are
    stripped before the usual tweet clean-up.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        Lower-cased, punctuation-free, stop-word-free, lemmatised tokens joined
        by single spaces; ``''`` when nothing survives cleaning.
    """
    # Missing cells arrive as float NaN and would make re.sub raise TypeError.
    if not isinstance(text, str):
        return ''

    # Strip the bytes-literal prefix only where it opens the string; an
    # unanchored pattern also deletes "b'" inside words such as "Bob's".
    text = re.sub(r"^\s*b['\"]", '', text)
    # Retweet marker (case-sensitive on purpose, so it runs before lower()).
    text = re.sub(r'\bRT\b', '', text)
    # User mentions.
    text = re.sub(r'@\w+', '', text)
    # Literal hex escapes left over from the bytes repr (e.g. \xe2\x80\x99).
    # Exactly two hex digits are removed so the word that follows is kept.
    text = re.sub(r'\\x[a-fA-F0-9]{2}', '', text)
    # Literal whitespace escapes (\n, \r, \t) act as separators, not as text.
    text = re.sub(r'\\[nrt]', ' ', text)
    # URLs.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Drop everything from the standalone word "truncated" onwards.
    text = re.split(r'\btruncated\b', text, maxsplit=1)[0]
    # Keep the hashtag word, drop the '#'.
    text = re.sub(r'#', '', text)
    # Remaining punctuation (this also removes any leftover backslashes/quotes).
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()

    tokens = word_tokenize(text)

    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords_set]
    return ' '.join(filtered_tokens)


In [39]:
# Clean every tweet; the function is passed directly, no wrapper lambda needed.
following['cleaned'] = following['tweets'].apply(clean_text)

In [40]:
# Preview the table again, now including the new 'cleaned' column.
following

Unnamed: 0,user,tweets,labels,follows,cleaned
0,user1,"b""@theresa_may U are a complete bitch an a sel...",1,following1,u complete bitch sell part deal dont u underst...
1,user1,"b'RT @NadineDorries: So, it\xe2\x80\x99s true....",4,following1,true barnier eu pushing u deal leaver want eu ...
2,user1,"b""@theresa_may @10DowningStreet Why do u never...",4,following1,u never celebrate st george day didnt u cover ...
3,user1,"b""@Anna_Soubry @theresa_may @sarahwollaston @B...",1,following1,fuck bit leave dont u understand
4,user1,b'RT @RealitySmash: The left want this video o...,0,following1,left want video muslim peaceful muslimming she...
...,...,...,...,...,...
4259,user52,And she's pushing for Sharia Law in America. ...,0,following65,shes pushing sharia law america shes dangerous
4260,user52,RT @RGodGivenRights: @Lrihendry @PinkBelgium @...,0,following65,people cant allow illegals rewrite law sharia ...
4261,user52,RT @monk_asian: #Sharia's procedure of #hand a...,0,following65,sharia procedure hand foot chopping tie reduce...
4262,user52,RT @LanceSilver1: #Sharia Law VIOLATES Our #Bi...,0,following65,sharia law violates billofrights 1440 yr clash...


In [41]:
# Remove, in place, every row whose cleaned text came out as the empty string.
to_drop = following.loc[following['cleaned'].eq("")].index
following.drop(index=to_drop, inplace=True)

In [42]:
# Side-by-side view of the raw tweet, its cleaned form, and its label.
following[['tweets', 'cleaned', "labels"]]

Unnamed: 0,tweets,cleaned,labels
0,"b""@theresa_may U are a complete bitch an a sel...",u complete bitch sell part deal dont u underst...,1
1,"b'RT @NadineDorries: So, it\xe2\x80\x99s true....",true barnier eu pushing u deal leaver want eu ...,4
2,"b""@theresa_may @10DowningStreet Why do u never...",u never celebrate st george day didnt u cover ...,4
3,"b""@Anna_Soubry @theresa_may @sarahwollaston @B...",fuck bit leave dont u understand,1
4,b'RT @RealitySmash: The left want this video o...,left want video muslim peaceful muslimming she...,0
...,...,...,...
4259,And she's pushing for Sharia Law in America. ...,shes pushing sharia law america shes dangerous,0
4260,RT @RGodGivenRights: @Lrihendry @PinkBelgium @...,people cant allow illegals rewrite law sharia ...,0
4261,RT @monk_asian: #Sharia's procedure of #hand a...,sharia procedure hand foot chopping tie reduce...,0
4262,RT @LanceSilver1: #Sharia Law VIOLATES Our #Bi...,sharia law violates billofrights 1440 yr clash...,0


In [43]:
# Persist the cleaned table. to_csv writes the DataFrame index as an unnamed
# first column by default, so reading this file back yields an 'Unnamed: 0' column.
following.to_csv("../Dataset/tweets/cleaned.csv")

In [44]:
def create_ngrams(text, n):
    """Tokenise ``text`` with ``word_tokenize`` and return its n-grams as a list.

    ``n`` is the n-gram size (2 for bigrams, 3 for trigrams, ...).
    """
    return list(ngrams(word_tokenize(text), n))

In [45]:
# One list of n-grams per tweet; `n` is forwarded to create_ngrams by apply().
following['bigram'] = following['cleaned'].apply(create_ngrams, n=2)
following['trigram'] = following['cleaned'].apply(create_ngrams, n=3)

In [46]:
def make_ngram_df(column='bigram', df=None, top_n=1):
    """Return the most frequent n-gram(s) of each label as a DataFrame.

    Parameters
    ----------
    column : str
        Name of the column holding one list of n-grams per row.
    df : pandas.DataFrame, optional
        Frame to summarise; it must contain a ``'labels'`` column and
        ``column``. Defaults to the notebook-level ``following`` frame.
    top_n : int
        How many n-grams to keep per label (default 1).

    Returns
    -------
    pandas.DataFrame
        Columns ``'category'`` (the label), ``column`` (the n-gram) and
        ``'count'``. A label whose rows contain no n-grams at all contributes
        no rows.
    """
    if df is None:
        df = following

    records = []
    for label, group in df.groupby("labels"):
        # explode() turns an empty list into NaN; without dropna() those NaNs
        # are counted and can be reported as the "most common n-gram".
        ngram_counts = Counter(group[column].explode().dropna().tolist())
        for ngram, count in ngram_counts.most_common(top_n):
            records.append({'category': label, column: ngram, 'count': count})

    # Explicit columns keep the schema stable even when no records were found.
    return pd.DataFrame(records, columns=['category', column, 'count'])

In [47]:
# Export the top bigram per label. to_csv does not create missing folders, so
# ../Dataset/tweets/results/ must already exist.
make_ngram_df('bigram').to_csv("../Dataset/tweets/results/twitter_top_bigram_by_category.csv")

In [48]:
# Export the top trigram per label. to_csv does not create missing folders, so
# ../Dataset/tweets/results/ must already exist.
make_ngram_df('trigram').to_csv("../Dataset/tweets/results/twitter_top_trigram_by_category.csv")
