In [None]:
# importing libraries
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string

# for data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NLP tools
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# train split and fit models
from sklearn.model_selection import train_test_split
from sklearn import svm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# New Section

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device it can see. Report it when it is the
# expected first GPU; otherwise abort the run.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print("device found", device_name)
else:
  raise SystemError("GPU not found")

device found /device:GPU:0


In [None]:
# Read the labelled tweets from the CSV file and display the resulting frame.
tweets_csv_path = '/content/Tweets_Final_Dataset.csv'
dataset = pd.read_csv(tweets_csv_path)
dataset

Unnamed: 0,Index,Label,Tweet
0,1,1.0,@ceeque84 Yep!! Some asshole that doesnâ€™t ta...
1,2,1.0,"By that asshole's standards, ANYONE in America..."
2,3,0.0,RT @Scattered211: Just so we're really clear a...
3,4,1.0,@DuDuPlantier @BiggsL5T It's illegal to hog an...
4,5,1.0,RT @_gonfreeecss: @txtfrombrand @txtdrjkt Seba...
...,...,...,...
29996,29997,0.0,https://t.co/OIyRkOxke6\n\nI found this mother...
29997,29998,0.0,RT @MinisterMOFA: Our sincere congratulations ...
29998,29999,0.0,@_moonbeems ðŸ¥ºâ¤ï¸ youâ€™ll enjoy the peace
29999,30000,0.0,RT @julezlafiesta: That thing thatâ€™s botheri...


In [None]:
# Build the working frame from the three columns used in the rest of the notebook.
selected_columns = ['Index', 'Label', 'Tweet']
df = pd.DataFrame(dataset, columns=selected_columns)
df

Unnamed: 0,Index,Label,Tweet
0,1,1.0,@ceeque84 Yep!! Some asshole that doesnâ€™t ta...
1,2,1.0,"By that asshole's standards, ANYONE in America..."
2,3,0.0,RT @Scattered211: Just so we're really clear a...
3,4,1.0,@DuDuPlantier @BiggsL5T It's illegal to hog an...
4,5,1.0,RT @_gonfreeecss: @txtfrombrand @txtdrjkt Seba...
...,...,...,...
29996,29997,0.0,https://t.co/OIyRkOxke6\n\nI found this mother...
29997,29998,0.0,RT @MinisterMOFA: Our sincere congratulations ...
29998,29999,0.0,@_moonbeems ðŸ¥ºâ¤ï¸ youâ€™ll enjoy the peace
29999,30000,0.0,RT @julezlafiesta: That thing thatâ€™s botheri...


In [None]:
# Summarise df: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30001 entries, 0 to 30000
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Index   30001 non-null  int64  
 1   Label   29903 non-null  float64
 2   Tweet   30000 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 703.3+ KB


In [None]:
# True when at least one cell anywhere in df is null.
df.isnull().values.any()

True

In [None]:
# Drop every row that contains at least one null value, modifying df in place.
df.dropna(inplace=True)

In [None]:
# Re-run the null check to confirm no null cells remain in df.
df.isnull().values.any()

False

In [None]:
# Count how many rows carry each Label value and show the counts as a table.
label_counts = df['Label'].value_counts()
display(label_counts.to_frame())

Unnamed: 0,Label
1.0,19287
0.0,10615


In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy

In [None]:
# Load the small English pipeline by its package name. The 'en' shortcut link
# was removed in spaCy 3 and raises OSError there, while the full package name
# is accepted by both spaCy 2 and 3. The parser and NER components are
# disabled, as before.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Replace spaces in the column names with underscores.
normalized_columns = df.columns.str.replace(' ', '_')
df.columns = normalized_columns

Make Text Lowercase

In [None]:
# Lowercase every whitespace-separated word of the raw tweet and re-join the
# words with single spaces; the result becomes the working New_Tweet column.
df['New_Tweet'] = df['Tweet'].apply(
    lambda tweet: " ".join(word.lower() for word in tweet.split())
)
df['New_Tweet'].head()

0    @ceeque84 yep!! some asshole that doesnâ€™t ta...
1    by that asshole's standards, anyone in america...
2    rt @scattered211: just so we're really clear a...
3    @duduplantier @biggsl5t it's illegal to hog an...
4    rt @_gonfreeecss: @txtfrombrand @txtdrjkt seba...
Name: New_Tweet, dtype: object

Remove Punctuation

In [None]:
# Delete every character that is neither a word character nor whitespace.
# `regex=True` is passed explicitly: the implicit default emitted a
# FutureWarning in pandas 1.x and became False in pandas 2.0, where this
# pattern would be treated as a literal string and match nothing.
# The pattern is a raw string so `\w` / `\s` are not read as string escapes.
df['New_Tweet'] = df['New_Tweet'].str.replace(r'[^\w\s]', '', regex=True)
df['New_Tweet'].head()

  """Entry point for launching an IPython kernel.


0    ceeque84 yep some asshole that doesnât take sh...
1    by that assholes standards anyone in america w...
2    rt scattered211 just so were really clear abou...
3    duduplantier biggsl5t its illegal to hog and p...
4    rt _gonfreeecss txtfrombrand txtdrjkt sebagian...
Name: New_Tweet, dtype: object

Remove Emojis

In [None]:
# REFERENCE : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# Compiled once at definition time so the pattern is not rebuilt for every tweet.
#
# The previous character class contained the range U+24C2-U+1F251, which spans
# almost the whole Basic Multilingual Plane above U+24C2 and therefore deleted
# CJK ideographs, kana, Hangul and other ordinary text. The class below keeps
# everything the old one removed up to U+2BEF and in the supplementary planes,
# and above U+2BEF only lists the individual emoji-related code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00010000-\U0010FFFF"  # every other supplementary-plane code point
    "\u24C2-\u2BEF"          # BMP symbol blocks (covers the old 2500-2BEF, 2600-2B55, 2702-27B0, 2640-2642)
    "\u200d"                 # zero-width joiner
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297\u3299"           # circled ideographs used as emoji
    "\ufe0f"                 # variation selector-16
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        deleted. Surrounding whitespace is left untouched, so removing an
        emoji between two spaces leaves both spaces in place.
    """
    return _EMOJI_PATTERN.sub(r'', text)
# Strip emoji from every tweet in the working column.
df['New_Tweet'] = df['New_Tweet'].apply(remove_emoji)

Remove Stopwords

In [None]:
# `stop` stays a list for any later cell that uses it. Membership is tested
# against a frozenset copy, so each word lookup is O(1) rather than a scan of
# the whole list.
stop = stopwords.words('english')
stop_lookup = frozenset(stop)

# Keep only the whitespace-separated words that are not English stop words.
df['New_Tweet'] = df['New_Tweet'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
df.head(20)

Unnamed: 0,Index,Label,Tweet,New_Tweet
0,1,1.0,@ceeque84 Yep!! Some asshole that doesnâ€™t ta...,ceeque84 yep asshole doesnât take shit scares ...
1,2,1.0,"By that asshole's standards, ANYONE in America...",assholes standards anyone america reads j crew...
2,3,0.0,RT @Scattered211: Just so we're really clear a...,rt scattered211 really clear democrats dont wa...
3,4,1.0,@DuDuPlantier @BiggsL5T It's illegal to hog an...,duduplantier biggsl5t illegal hog park left la...
4,5,1.0,RT @_gonfreeecss: @txtfrombrand @txtdrjkt Seba...,rt _gonfreeecss txtfrombrand txtdrjkt sebagian...
5,6,0.0,thank you for God's sake thank you\n\nAswell a...,thank gods sake thank aswell moments asshole c...
6,7,0.0,RT @GamoDaBoss: i aint gone lie. white truck a...,rt gamodaboss aint gone lie white truck asshol...
7,8,1.0,Damn was tommy really an asshole like that. Se...,damn tommy really asshole like seth rogan play...
8,9,0.0,Would i be an asshole if i Doordashed in this ...,would asshole doordashed weather defense busy ...
9,10,1.0,"WOULD BE NICE if, you know, ANY rich asshole t...",would nice know rich asshole owns property wou...


Lemmatization

In [None]:
def space(comment):
  """Lemmatise one piece of text with the spaCy pipeline ``nlp``.

  Parameters
  ----------
  comment : str
      Text to process.

  Returns
  -------
  str
      The lemma of every token produced by ``nlp(comment)``, joined by single
      spaces.
  """
  doc = nlp(comment)
  return " ".join([token.lemma_ for token in doc])


# The former `with tf.device("/gpu:0")` wrapper was dropped. It enclosed only
# the function definition, so no TensorFlow op ever ran inside it, and spaCy is
# not controlled by TensorFlow device scopes.
df['New_Tweet'] = df['New_Tweet'].apply(space)
df.head(10)

Unnamed: 0,Index,Label,Tweet,New_Tweet
0,1,1.0,@ceeque84 Yep!! Some asshole that doesnâ€™t ta...,ceeque84 yep asshole doesnât take shit scare t...
1,2,1.0,"By that asshole's standards, ANYONE in America...",assholes standards anyone america read j crew ...
2,3,0.0,RT @Scattered211: Just so we're really clear a...,rt scattered211 really clear democrats do not ...
3,4,1.0,@DuDuPlantier @BiggsL5T It's illegal to hog an...,duduplantier biggsl5 t illegal hog park leave ...
4,5,1.0,RT @_gonfreeecss: @txtfrombrand @txtdrjkt Seba...,rt _ gonfreeecss txtfrombrand txtdrjkt sebagia...
5,6,0.0,thank you for God's sake thank you\n\nAswell a...,thank god sake thank aswell moment asshole cha...
6,7,0.0,RT @GamoDaBoss: i aint gone lie. white truck a...,rt gamodaboss be not go lie white truck asshol...
7,8,1.0,Damn was tommy really an asshole like that. Se...,damn tommy really asshole like seth rogan play...
8,9,0.0,Would i be an asshole if i Doordashed in this ...,would asshole doordashe weather defense busy w...
9,10,1.0,"WOULD BE NICE if, you know, ANY rich asshole t...",would nice know rich asshole own property woul...


In [None]:
# importing libraries

# methods and stopwords text processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# machine learning libraries
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

Preprocessing

In [None]:
import nltk

# Download the NLTK resources used below (punkt tokenizer data, the WordNet
# corpus and the stop-word lists), then cache the English stop words as a set.
for resource in ('punkt', 'wordnet', 'stopwords'):
  nltk.download(resource)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess_tweet_text(tweet):
  """Normalise one tweet into a space-joined string of processed tokens.

  Steps, in order: lowercase; remove URLs; remove @mentions and '#' signs;
  strip ASCII punctuation; tokenise; drop English stop words; Porter-stem;
  WordNet-lemmatise.

  Parameters
  ----------
  tweet : str
      Tweet text to clean.

  Returns
  -------
  str
      The processed tokens joined by single spaces.
  """
  # convert all text to lowercase
  tweet = tweet.lower()

  # remove any urls (the previous alternatives `www\St` and `https\st+` were
  # typos for `\S+`; `http\S+` already covers https links)
  tweet = re.sub(r"http\S+|www\S+", "", tweet, flags=re.MULTILINE)

  # remove user @ references and '#' from tweet. This has to happen BEFORE
  # punctuation is stripped: '@' and '#' are themselves punctuation, so running
  # it afterwards (as before) could never match anything.
  tweet = re.sub(r"@\w+|#", "", tweet)

  # remove punctuation
  tweet = tweet.translate(str.maketrans("", "", string.punctuation))

  # remove stopwords
  tweet_tokens = word_tokenize(tweet)
  filtered_words = [word for word in tweet_tokens if word not in stop_words]

  # stemming
  ps = PorterStemmer()
  stemmed_words = [ps.stem(w) for w in filtered_words]

  # lemmatizing
  # NOTE(review): the already-stemmed tokens are lemmatised as adjectives
  # (pos='a'); confirm this is intended.
  lemmatizer = WordNetLemmatizer()
  lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]

  return " ".join(lemma_words)



In [None]:
# Run preprocess_tweet_text over the already-cleaned New_Tweet column and
# store the result back in the same column.
df['New_Tweet'] = df['New_Tweet'].apply(preprocess_tweet_text)

Splitting Data

In [None]:
# Number of whitespace-separated words in each raw tweet.
df['Num_words_text'] = df['Tweet'].apply(lambda x: len(str(x).split()))

# Hold out 20% of the rows as the test set. A fixed seed makes the split
# reproducible across kernel restarts, and stratifying on Label keeps the class
# proportions equal in both parts (the later train/validation split does the
# same).
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Label'],
    random_state=0,
)

# Renumber the rows from 0. Rebinding is used rather than inplace=True so the
# split results are not mutated in place.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Print the Label counts and the row count for the train and test portions.
for banner, frame in (
    ('===========Train Data =========', train_data),
    ('===========Test Data =========', test_data),
):
    print(banner)
    print(frame['Label'].value_counts())
    print(len(frame))
    print('==============================')

1.0    15401
0.0     8520
Name: Label, dtype: int64
23921
1.0    3886
0.0    2095
Name: Label, dtype: int64
5981


In [None]:
from sklearn import preprocessing
#Encoding data
# Fit a separate LabelEncoder on every column of `dataset`, overwrite the
# column with its integer codes, and record the encoder's classes per column
# in labelDict under the key 'label_<column name>'.
# NOTE(review): this mutates `dataset` in place, and neither `dataset` nor
# `labelDict` is read again in the cells visible here; confirm it is still needed.
labelDict = {}

for feature in dataset:
    le = preprocessing.LabelEncoder()
    le.fit(dataset[feature])
    # Map each original class value to the integer code assigned by the encoder.
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    dataset[feature] = le.transform(dataset[feature])
    # Get labels
    labelKey = 'label_' + feature
    # The mapping's keys, i.e. the original class values.
    labelValue = [*le_name_mapping]
    labelDict[labelKey] =labelValue
    
# for key, value in labelDict.items():     
#     print(key, value)

In [None]:
# Display the working frame, including the New_Tweet and Num_words_text columns.
df

Unnamed: 0,Index,Label,Tweet,New_Tweet,Num_words_text
0,1,1.0,@ceeque84 Yep!! Some asshole that doesnâ€™t ta...,ceeque84 yep asshol doesnât take shit scare te...,18
1,2,1.0,"By that asshole's standards, ANYONE in America...",asshol standard anyon america read j crew cata...,22
2,3,0.0,RT @Scattered211: Just so we're really clear a...,rt scattered211 realli clear democrat want sus...,24
3,4,1.0,@DuDuPlantier @BiggsL5T It's illegal to hog an...,duduplanti biggsl5 illeg hog park leav lane la...,27
4,5,1.0,RT @_gonfreeecss: @txtfrombrand @txtdrjkt Seba...,rt gonfreeecss txtfrombrand txtdrjkt sebagian ...,20
...,...,...,...,...,...
29996,29997,0.0,https://t.co/OIyRkOxke6\n\nI found this mother...,find motherfuck love death peac alien tom,15
29997,29998,0.0,RT @MinisterMOFA: Our sincere congratulations ...,rt ministermofa sincer congratul ðÿºðÿ ðÿ¹ðÿ ð...,29
29998,29999,0.0,@_moonbeems ðŸ¥ºâ¤ï¸ youâ€™ll enjoy the peace,moonbeem ðÿºâï youâll enjoy peac,6
29999,30000,0.0,RT @julezlafiesta: That thing thatâ€™s botheri...,rt julezlafiesta thing thatâ bother canât cont...,20


In [None]:
from collections import Counter

In [None]:
# Carve a stratified 20% validation set out of the training portion.
tweet_texts = train_data['New_Tweet'].tolist()
tweet_labels = train_data['Label'].tolist()
X_train, X_valid, y_train, y_valid = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.2,
    stratify=tweet_labels,
    random_state=0,
)

# Report the size and class balance of both parts.
print(f'Train data len:{len(X_train)}')
print(f'Class distribution{Counter(y_train)}')
print(f'Valid data len:{len(X_valid)}')
print(f'Class distribution{Counter(y_valid)}')

Train data len:19136
Class distributionCounter({1.0: 12320, 0.0: 6816})
Valid data len:4785
Class distributionCounter({1.0: 3081, 0.0: 1704})


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# vectorize tweets for model building: binary presence/absence features,
# with scikit-learn's built-in English stop-word list removed
vectorizer = CountVectorizer(binary=True, stop_words='english')

# Learn the vocabulary from the training documents only, and transform them in
# the same step. Fitting on X_train + X_valid (as before) lets validation data
# shape the feature space, which is a form of data leakage.
x_train_vec = vectorizer.fit_transform(X_train)

# Project the validation documents onto the training vocabulary. The name
# x_test_vec is kept because later cells refer to it.
x_test_vec = vectorizer.transform(X_valid)

In [None]:
from sklearn import tree

In [None]:
# Fit a decision tree with a fixed seed on the training vectors, then obtain
# class probabilities and hard predictions for the validation vectors.
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier.fit(x_train_vec, y_train)
prob = classifier.predict_proba(x_test_vec)
y_pred_dt = classifier.predict(x_test_vec)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score

# Score the decision tree's validation predictions.
dt_accuracy = accuracy_score(y_valid, y_pred_dt)
dt_precision = precision_score(y_valid, y_pred_dt)
dt_recall = recall_score(y_valid, y_pred_dt)
dt_f1_score = f1_score(y_valid, y_pred_dt)
dt_f1_weighted = f1_score(y_valid, y_pred_dt, average='weighted')

# printing scores
for score_label, score_value in (
    ("Precision: ", dt_precision),
    ("Recall: ", dt_recall),
    ("F1 Score: ", dt_f1_score),
    ("Weighted F1 Score: ", dt_f1_weighted),
    ("Accuracy: ", dt_accuracy),
):
    print(score_label, score_value)

Precision:  0.9582504970178927
Recall:  0.9386562804284323
F1 Score:  0.9483521888834234
Weighted F1 Score:  0.9344267043275633
Accuracy:  0.9341692789968652
