**Toxic Comment Classifier**

This model takes text data as an input and identifies whether the text is toxic or not with 6 levels of toxicity.

In building this model few references have been taken into consideration mainly from 'Linebyline.ai' page from github. 
Link: https://github.com/line-by-line/toxic_comments_classifier

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Datasets

/content/drive/MyDrive/Datasets


In [4]:
ls

sample_submission.csv  test.csv  test_labels.csv  train.csv


In [62]:
import pandas as pd
import tensorflow as tf
import nltk
import re
import numpy as np
np.set_printoptions(suppress=True)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, roc_curve, f1_score, confusion_matrix
from nltk.corpus import stopwords
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve, fbeta_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [6]:
df=pd.read_csv('train.csv')

In [7]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


**Data Analysis**

In [8]:
print(df.shape)
df.dtypes

(159571, 8)


id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [9]:
df.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [10]:
df['comment_text']=df['comment_text'].astype('str')
df.dtypes

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [11]:
# Checking the distribution of multi labels data in actual numbers

multi_labels=['toxic','severe_toxic','obscene','threat','insult','identity_hate']
values_numbers={}
for key,value in df.items():
  if key in multi_labels:
    values_numbers[key + ' label division'] = df[key].value_counts()


In [12]:
values_numbers=pd.DataFrame(values_numbers)
values_numbers

Unnamed: 0,toxic label division,severe_toxic label division,obscene label division,threat label division,insult label division,identity_hate label division
0,144277,157976,151122,159093,151694,158166
1,15294,1595,8449,478,7877,1405


In [13]:
label_df=df[multi_labels]
label_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [14]:
# Checking the distribution of multi labels data in percentage

multi_labels=['toxic','severe_toxic','obscene','threat','insult','identity_hate']
values_percentage={}
for key,value in df.items():
  if key in multi_labels:
    values_percentage[key + ' label division (%)'] = df[key].value_counts()/len(df[key].index)*100


In [15]:
values_percentage=pd.DataFrame(values_percentage)
values_percentage

Unnamed: 0,toxic label division (%),severe_toxic label division (%),obscene label division (%),threat label division (%),insult label division (%),identity_hate label division (%)
0,90.415552,99.000445,94.705178,99.700447,95.063639,99.119514
1,9.584448,0.999555,5.294822,0.299553,4.936361,0.880486


**DATA CLEANING**

In [16]:
df_copy=df.copy()
df_copy

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [17]:
from nltk.stem import WordNetLemmatizer
import string
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
df_copy['comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

Removing Line breaks, punctuations, Links, Hashtags, and etc.

In [19]:
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub("\n|\r", " ", x)) #Line breaks
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub('[^-9A-Za-z ]', '', x)) # Punctuations
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub('[%s]' % re.escape(string.punctuation), '', x.lower()))
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub(r'[^\x00-\x7f]',r'', x)) #non Ascii
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub('@\S+', '', x))# mentions @
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub('#\S+', '', x)) # Hashtags
df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub('https*\S+', '', x)) #Links
# df_copy['comment_text']=df_copy['comment_text'].apply( lambda x : re.sub(r'\s+', '', x, flags=re.I))


In [20]:
#Removing stopwords and rounding up to its base word by using Lemmatization
lmt=WordNetLemmatizer()

def remove_stopwords(text):
  text=[lmt.lemmatize(word) for word in text.split() if word not in stopwords.words('english')]
  return " ".join(text)

In [21]:
df_copy['comment_text']=df_copy['comment_text'].map(remove_stopwords)

In [23]:
df_copy['comment_text']

0         explanation edits made username hardcore metal...
1         daww match background colour im seemingly stuc...
2         hey man im really trying edit war guy constant...
3         cant make real suggestion improvement wondered...
4                       sir hero chance remember page thats
                                ...                        
159566    second time asking view completely contradicts...
159567              ashamed horrible thing put talk page 99
159568    spitzer umm there actual article prostitution ...
159569    look like actually put speedy first version de...
159570    really dont think understand came idea bad rig...
Name: comment_text, Length: 159571, dtype: object

In [24]:
 #Toxic comment example
df_copy[df_copy['toxic']==1]['comment_text']

6                               cocksucker piss around work
12        hey talk exclusive group wp talibanswho good d...
16             bye dont look come think comming back tosser
42        gay antisemmitian archangel white tiger meow g...
43                                fuck filthy mother as dry
                                ...                        
159494    previous conversation fucking shit eating libe...
159514                              mischievious pubic hair
159541    absurd edits absurd edits great white shark to...
159546    hey listen dont ever delete edits ever im anno...
159554    im going keep posting stuff u deleted fucking ...
Name: comment_text, Length: 15294, dtype: object

In [25]:
#Maximum Length of a single sentence
def max_len(x):
    a=x.split()
    return len(a)

In [26]:
sen_len=df['comment_text'].apply(max_len)
print('Maximum length of each sentence')
sen_len.sort_values(ascending=False)

159571


140904    1411
4712      1403
81295     1354
35817     1344
32143     1250
          ... 
111438       1
141293       1
52475        1
106891       1
110293       1
Name: comment_text, Length: 159571, dtype: int64

Distributing the data into equal set of 0 and 1 examples.

In [27]:
df_copy_toxic=df_copy[(df_copy['toxic']==1)|(df_copy['severe_toxic']==1)|(df_copy['obscene']==1)|(df_copy['threat']==1)|(df_copy['insult']==1)|(df_copy['identity_hate']==1)]
df_copy_nontoxic=df_copy[(df_copy['toxic']==0)&(df_copy['severe_toxic']==0)&(df_copy['obscene']==0)&(df_copy['threat']==0)&(df_copy['insult']==0)&(df_copy['identity_hate']==0)].iloc[0:17000,:]

In [28]:
df_copy_toxic

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,cocksucker piss around work,1,1,1,0,1,0
12,0005c987bdfc9d4b,hey talk exclusive group wp talibanswho good d...,1,0,0,0,0,0
16,0007e25b2121310b,bye dont look come think comming back tosser,1,0,0,0,0,0
42,001810bf8c45bf5f,gay antisemmitian archangel white tiger meow g...,1,0,1,0,1,1
43,00190820581d90ce,fuck filthy mother as dry,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
159494,fef4cf7ba0012866,previous conversation fucking shit eating libe...,1,0,1,0,1,1
159514,ff39a2895fc3b40e,mischievious pubic hair,1,0,0,0,1,0
159541,ffa33d3122b599d6,absurd edits absurd edits great white shark to...,1,0,1,0,1,0
159546,ffb47123b2d82762,hey listen dont ever delete edits ever im anno...,1,0,0,0,1,0


In [29]:
df_copy_nontoxic

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour im seemingly stuc...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,0001b41b1c6bb37e,cant make real suggestion improvement wondered...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
18955,32110519a80f7f14,hey wiki dronebot get life asshole removing li...,0,0,0,0,0,0
18956,3211215e87ef7e93,checked article matter word matter rigorous un...,0,0,0,0,0,0
18958,321156bd48d3dfae,root bd aramaic mean work article state sh bib...,0,0,0,0,0,0
18959,321183c94c23961a,try find source source armin wenger photo,0,0,0,0,0,0


In [30]:
#Merging the two datasets (Toxic and non toxic of same shape)
df_copy2=pd.concat([df_copy_toxic,df_copy_nontoxic], axis=0)
df_copy2

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,cocksucker piss around work,1,1,1,0,1,0
12,0005c987bdfc9d4b,hey talk exclusive group wp talibanswho good d...,1,0,0,0,0,0
16,0007e25b2121310b,bye dont look come think comming back tosser,1,0,0,0,0,0
42,001810bf8c45bf5f,gay antisemmitian archangel white tiger meow g...,1,0,1,0,1,1
43,00190820581d90ce,fuck filthy mother as dry,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
18955,32110519a80f7f14,hey wiki dronebot get life asshole removing li...,0,0,0,0,0,0
18956,3211215e87ef7e93,checked article matter word matter rigorous un...,0,0,0,0,0,0
18958,321156bd48d3dfae,root bd aramaic mean work article state sh bib...,0,0,0,0,0,0
18959,321183c94c23961a,try find source source armin wenger photo,0,0,0,0,0,0


In [31]:
#Random Shuffling
df_copy2=df_copy2.sample(frac=1)
df_copy2

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
81986,db554e4995b34fd8,ankit fadia ankit fadia white hat hacker seria...,1,0,0,0,0,0
14465,26349680abf14eeb,cant cal keyboard died see last response rp,0,0,0,0,0,0
15667,296054ce141a170c,utc one quoting debbie schlussel dont understa...,0,0,0,0,0,0
15878,29ebcbb5f1f24fcd,go use drug insignificant nobody eraser stalke...,1,0,0,0,1,0
98062,0c8f1b74807fe3c9,litter act sect shit try qld act amendmentsget...,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
135728,d5d9bb056e944e75,buffalo bill vandal friend little vandal egoce...,1,0,1,0,1,0
12718,21bf589fcfe0c742,hi pterantula encyclopedia entry requires attr...,0,0,0,0,0,0
15734,298d00ead6aa2fc2,declaration slavery funny lighten esay dont ea...,1,0,1,0,0,0
105211,32dd1835749bb264,fuck bot fuck cluebot suck quit reverting shit...,1,0,1,0,1,1


In [32]:
#Max length of a single sentence in new distributed data frame
sen_len2=df_copy2['comment_text'].apply(max_len)
print(sen_len2.shape[0])
sen_len2.sort_values(ascending=False)

33225


76598     1250
32143     1250
153353    1247
32400     1235
106964    1078
          ... 
96423        1
8846         0
3990         0
2407         0
9395         0
Name: comment_text, Length: 33225, dtype: int64

**MULTI LABEL CLASSIFICATION USING TRADITIONAL MACHINE LEARNING ALGORITHMS**

In [33]:
df_copy2

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
81986,db554e4995b34fd8,ankit fadia ankit fadia white hat hacker seria...,1,0,0,0,0,0
14465,26349680abf14eeb,cant cal keyboard died see last response rp,0,0,0,0,0,0
15667,296054ce141a170c,utc one quoting debbie schlussel dont understa...,0,0,0,0,0,0
15878,29ebcbb5f1f24fcd,go use drug insignificant nobody eraser stalke...,1,0,0,0,1,0
98062,0c8f1b74807fe3c9,litter act sect shit try qld act amendmentsget...,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
135728,d5d9bb056e944e75,buffalo bill vandal friend little vandal egoce...,1,0,1,0,1,0
12718,21bf589fcfe0c742,hi pterantula encyclopedia entry requires attr...,0,0,0,0,0,0
15734,298d00ead6aa2fc2,declaration slavery funny lighten esay dont ea...,1,0,1,0,0,0
105211,32dd1835749bb264,fuck bot fuck cluebot suck quit reverting shit...,1,0,1,0,1,1


In order to perform multi-label classification with sci-kit learn, Data should be distributed for each label.

In [56]:
#Label distribution for each class in distributed data

# toxic_data=df_copy2.loc[:,['id','comment_text','toxic']]
# severe_toxic_data=df_copy2.loc[:,['id','comment_text','severe_toxic']]
# obscene_data=df_copy2.loc[:,['id','comment_text','obscene']]
# threat_data=df_copy2.loc[:,['id','comment_text','threat']]
# insult_data=df_copy2.loc[:,['id','comment_text','insult']]
# identity_hate_data=df_copy2.loc[:,['id','comment_text','identity_hate']]

Label distribution for each class in Un-distributed data

In [91]:
toxic_data=df_copy.loc[:,['id','comment_text','toxic']]
severe_toxic_data=df_copy.loc[:,['id','comment_text','severe_toxic']]
obscene_data=df_copy.loc[:,['id','comment_text','obscene']]
threat_data=df_copy.loc[:,['id','comment_text','threat']]
insult_data=df_copy.loc[:,['id','comment_text','insult']]
identity_hate_data=df_copy.loc[:,['id','comment_text','identity_hate']]

In [71]:
toxic_data

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,explanation edits made username hardcore metal...,0
1,000103f0d9cfb60f,daww match background colour im seemingly stuc...,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0
3,0001b41b1c6bb37e,cant make real suggestion improvement wondered...,0
4,0001d958c54c6e35,sir hero chance remember page thats,0
...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page 99,0
159568,ffee36eab5c267c9,spitzer umm there actual article prostitution ...,0
159569,fff125370e4aaaf3,look like actually put speedy first version de...,0


In [47]:
#Method which splits data into test and train
from sklearn.model_selection import train_test_split

def splitting_data (df):
  #splitting the data
  X_final=df.iloc[0:, 1]
  y_final=df.iloc[:,-1]

  X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=42)

  # Converting Text into vector form
  return X_train, X_test, y_train, y_test

In [64]:
#Method which perform vectorization

def vectorization (df, vector):

  X_train, X_test, y_train, y_test=splitting_data(df)

  if vector == 'cv' :
    cv=CountVectorizer()
    X_train_vec=cv.fit_transform(X_train)
    X_test_vec=cv.transform(X_test)

  if vector=='tf_idf':
    tf_idf=TfidfVectorizer()
    X_train_vec=tf_idf.fit_transform(X_train)
    X_test_vec=tf_idf.transform(X_test)

  # Initializing Models
  svm=LinearSVC().fit(X_train_vec,y_train)
  svm_f1=f1_score( svm.predict(X_test_vec),y_test)

  logistic_model=LogisticRegression().fit(X_train_vec,y_train)
  logistic_model_f1=f1_score( logistic_model.predict(X_test_vec),y_test)

  rf_model=RandomForestClassifier().fit(X_train_vec,y_train)
  rf_model_f1=f1_score( rf_model.predict(X_test_vec),y_test)

  accuracy= {'SVM_score':{'Accuracy':svm.score(X_test_vec, y_test), 'F1_score': svm_f1}, 'logistic_score': {'Accuracy':logistic_model.score(X_test_vec, y_test), 'F1_score':logistic_model_f1}, 'Random_Forest_score': {'Accuracy':rf_model.score(X_test_vec, y_test), 'F1_score': rf_model_f1 }}
  
  Accuracy_df=pd.DataFrame(accuracy)

  return Accuracy_df
  

In [72]:
X_train, X_test, y_train, y_test= splitting_data(toxic_data)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((119678,), (39893,), (119678,), (39893,))

In [51]:
# X_train_vec.shape, X_test_vec.shape

In [73]:
acc_toxic=vectorization(toxic_data, 'tf_idf')

In [74]:
acc

Unnamed: 0,SVM_score,logistic_score,Random_Forest_score
Accuracy,0.877332,0.874684,0.850969
F1_score,0.864368,0.859,0.826416


In [84]:
# acc_severe=vectorization(severe_toxic_data, 'tf_idf')
# acc_obscene=vectorization(obscene_data, 'tf_idf')
# acc_threat=vectorization(threat_data, 'tf_idf')
# acc_insult=vectorization(insult_data, 'tf_idf')
# acc_identity=vectorization(identity_hate_data, 'tf_idf')

Choosing a model 

In [116]:
  X_final=obscene_data.iloc[0:, 1]
  y_final=obscene_data.iloc[:,-1]

  X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=42)

  tf_idf=TfidfVectorizer()
  X_train_vec=tf_idf.fit_transform(X_train)
  X_test_vec=tf_idf.transform(X_test)

  # Initializing Models
  svm=LinearSVC().fit(X_train_vec,y_train)
  svm_f1=f1_score( svm.predict(X_test_vec),y_test)

  randomforest = RandomForestClassifier(n_estimators=100, random_state=42)
  randomforest.fit(X_train_vec, y_train)
  randomforest.predict(X_test_vec)


array([0, 0, 0, ..., 0, 0, 0])

In [117]:
svm.predict(X_test_vec)

array([0, 0, 0, ..., 0, 0, 0])

In [118]:
  rf_model_f1=f1_score( randomforest.predict(X_test_vec),y_test)

In [119]:
svm_f1, rf_model_f1

(0.7837203235063919, 0.740885054272196)

So far it is clear that SVM model is giving better accuracy and being choosen as the final model.

For pickling, I have used the existing method provided by 'LinebyLine.ai' in the following link https://github.com/line-by-line/toxic_comments_classifier/blob/master/Toxic%20Comments%20Classifier.ipynb .

In [127]:
def pickle_model(df, label):
    
    X_final=df.iloc[0:, 1]
    y_final=df.iloc[:,-1]

    # Initiate a Tfidf vectorizer
    tfv = TfidfVectorizer(stop_words='english')
    
    # Convert the X data into a document term matrix dataframe
    X_train_vec = tfv.fit_transform(X_final)  
    
    # saves the column labels (ie. the vocabulary)
    # wb means Writing to the file in Binary mode, written in byte objects
    with open(r"{}.pkl".format(label + '_vect'), "wb") as f:   
        pickle.dump(tfv, f)   
        
    rf=RandomForestClassifier().fit(X_train_vec,y_final)
    
    # Create a new pickle file based on random forest
    with open(r"{}.pkl".format(label + '_model'), "wb") as f:  
        pickle.dump(rf, f)

In [128]:
import pickle
datasets = [toxic_data, severe_toxic_data, obscene_data, threat_data, insult_data, identity_hate_data]
label = ['toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate']

for i,j in zip(datasets,label):
    pickle_model(i, j)

For the sake of increasing the accuracy of our model, I tried to apply Deep learning techniques by training own word embedding and word3vec models.

However, these approaches are not validated as I ended up with getting similar results.

**Corpus Creation**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
train_sentences=df_copy2.comment_text
train_sentences.shape

(33225,)

In [None]:
# lmt=WordNetLemmatizer()

In [None]:
# # Removing Stopwords and applying lemmatization
# corpus=[] # Creating a corpus which tokenizes the text data
# for text in df_copy.comment_text:
#   text=[lmt.lemmatize(word.lower()) for word in text.split() if word not in stopwords.words('english')]
#   text= " ".join(text)
#   corpus.append(text)

In [None]:
# corpus[0]

**Training own embedding using Keras**

In [None]:
#Counting the number of occurance of each word
from collections import Counter
def word_counter(text):
   count=Counter()
   for sentence in text:
     for word in sentence.split():
       count[word]=count[word]+1
   return count

In [None]:
text=df_copy2.comment_text
counter=word_counter(text)
len(counter)

67440

In [None]:
counter

In [None]:
n_words=len(counter)

In [None]:
tokens=Tokenizer(num_words=n_words)

In [None]:
tokens.fit_on_texts(text)

In [None]:
word_index=tokens.word_index

In [None]:
len(word_index)

67440

In [None]:
token_sequence=tokens.texts_to_sequences(text)

In [None]:
token_sequence[0]

[201,
 161,
 1118,
 117,
 70,
 1,
 2751,
 9612,
 82,
 13455,
 49,
 863,
 2568,
 129,
 29487,
 5257,
 473,
 186,
 96,
 15,
 1868,
 5,
 5076,
 21,
 1606]

In [None]:
max_length=1000

In [None]:
#Applying padding sequence for making all sentences equal
embedded_doc=pad_sequences(token_sequence, maxlen=max_length, padding='post', truncating='post')

In [None]:
print(embedded_doc)

[[  201   161  1118 ...     0     0     0]
 [ 1058  9613 10522 ...     0     0     0]
 [  350   886  6327 ...     0     0     0]
 ...
 [  443  1796   279 ...     0     0     0]
 [  163  1432  5209 ...     0     0     0]
 [   15   192    31 ...     0     0     0]]


In [None]:
n_labels=6

In [None]:
from tensorflow.keras.layers import Dropout

In [None]:
# model
embedding_vector_features=50
model1=Sequential()
model1.add(Embedding(n_words,embedding_vector_features,input_length=max_length))
# model1.add(Dense(50, kernel_initializer='he_uniform'))
model1.add(LSTM(128, dropout=0.1))

#Adding hidden dropout layers
# model1.add(Dropout(0.5))
# model1.add(LSTM(64))
# model1.add(Dropout(0.5))
model1.add(Dense(6, activation='softmax'))

model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 50)          3372000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dense (Dense)                (None, 6)                 774       
Total params: 3,464,422
Trainable params: 3,464,422
Non-trainable params: 0
_________________________________________________________________


In [None]:
len(embedded_doc)

33225

Splitting the Data sets

In [None]:
import numpy as np
X_final=np.array(embedded_doc)
y=df_copy2[multi_labels]
y_final=np.array(y)
# print(X_final[6],y_final[6])
print(X_final.shape,y_final.shape)

(33225, 1000) (33225, 6)


In [None]:

#Splitting the data set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=42)

In [None]:
X_train[6]

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((23257, 1000), (23257, 6), (9968, 1000), (9968, 6))

In [None]:
#Model Training
model1.fit(X_train,y_train,
           validation_data=(X_test,y_test),
           batch_size=32,
           epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fb44522f3d0>

In [None]:
y_pred=model1.predict(X_test)

In [None]:
y_pred[0:10]

array([[0.53276205, 0.03472301, 0.21825829, 0.0079515 , 0.18432619,
        0.02197899],
       [0.53276205, 0.03472301, 0.2182583 , 0.00795149, 0.1843262 ,
        0.02197899],
       [0.53276205, 0.03472301, 0.21825832, 0.00795149, 0.18432622,
        0.02197899],
       [0.53276205, 0.03472301, 0.21825829, 0.0079515 , 0.18432622,
        0.02197899],
       [0.53276205, 0.03472301, 0.2182583 , 0.00795149, 0.1843262 ,
        0.02197899],
       [0.53276205, 0.03472301, 0.21825829, 0.00795149, 0.18432622,
        0.02197899],
       [0.53276205, 0.03472301, 0.2182583 , 0.00795149, 0.1843262 ,
        0.02197899],
       [0.53276205, 0.03472301, 0.21825832, 0.00795149, 0.18432622,
        0.02197899],
       [0.53276205, 0.03472301, 0.21825829, 0.00795149, 0.18432622,
        0.02197899],
       [0.53276205, 0.03472301, 0.2182583 , 0.0079515 , 0.1843262 ,
        0.02197899]], dtype=float32)

In [None]:
y_proba=model1.predict_proba(X_test)



In [None]:
y_proba[0]

array([0.53276205, 0.03472301, 0.21825829, 0.0079515 , 0.18432619,
       0.02197899], dtype=float32)

In [None]:
test_sent=['You are a stupid motherfucker','This is an apple']
# tokens_test=Tokenizer(num_words=50)

In [None]:
tokens_test.fit_on_texts(test_sent)

In [None]:
t_sequence=tokens.texts_to_sequences(test_sent)

In [None]:
t_sequence

[[81, 947], [14607, 3818]]

In [None]:
test_embedded_doc=pad_sequences(t_sequence, maxlen=max_length, padding='post', truncating='post')

In [None]:
x=np.array(test_embedded_doc)
x

array([[   81,   947,     0, ...,     0,     0,     0],
       [14607,  3818,     0, ...,     0,     0,     0]], dtype=int32)

In [None]:
s=model1.predict(X_train[6])
s.shape

(1000, 6)

# **Training own Word2vec Model**

In [None]:
tokenized_tweet = df_copy['comment_text'].apply(lambda x: x.split()) # tokenizing

In [None]:
tokenized_tweet[0]

['explanation',
 'edits',
 'made',
 'username',
 'hardcore',
 'metallica',
 'fan',
 'reverted',
 'werent',
 'vandalism',
 'closure',
 'gas',
 'voted',
 'new',
 'york',
 'doll',
 'fac',
 'please',
 'dont',
 'remove',
 'template',
 'talk',
 'page',
 'since',
 'im',
 'retired',
 'now9']

In [None]:
len(tokenized_tweet)

159571

In [None]:
from gensim.models import Word2Vec
import nltk
import numpy as np
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
model_w2v=Word2Vec(tokenized_tweet, min_count=3)

In [None]:
model_w2v

<gensim.models.word2vec.Word2Vec at 0x7fb44852f890>

In [None]:
model_w2v.train(tokenized_tweet, total_examples= len(df_copy['comment_text']), epochs=20)

(101718942, 109590540)

In [None]:
words=model_w2v[model_w2v.wv.vocab]
words.shape

  """Entry point for launching an IPython kernel.


(56680, 100)

In [None]:
model_w2v.wv.most_similar(positive="dinner")

[('meal', 0.6671488285064697),
 ('laughter', 0.612054705619812),
 ('lunch', 0.5855309367179871),
 ('breakfast', 0.5773997902870178),
 ('wine', 0.5702009797096252),
 ('thanksgiving', 0.5493760108947754),
 ('restaurant', 0.5480619668960571),
 ('celebrate', 0.5438820123672485),
 ('drink', 0.5411736369132996),
 ('coffee', 0.5395969152450562)]

In [None]:
model_w2v.wv.most_similar('bastard')

[('prick', 0.6835364699363708),
 ('tosser', 0.663966715335846),
 ('filthy', 0.6569685935974121),
 ('commie', 0.6292790770530701),
 ('loser', 0.6182609796524048),
 ('ecto', 0.6150409579277039),
 ('hanibal9youre', 0.6105689406394958),
 ('slapper', 0.6029486656188965),
 ('scum', 0.6008085012435913),
 ('retard', 0.599929928779602)]

In [None]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary
                         
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
wordvec_arrays = np.zeros((len(tokenized_tweet), 100))

for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 100)
    
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

  


(159571, 100)

In [None]:
wordvec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.042393,-0.266596,-0.542382,-0.472387,-0.326446,0.492492,-0.674238,0.031103,-0.078385,0.053402,-0.213681,0.184533,-0.702228,-0.366960,0.311131,0.208850,0.147575,-0.058842,0.359058,0.576771,0.083936,-0.566892,0.510082,0.498563,-0.265674,-0.454897,0.268060,0.614659,0.133895,-0.389771,-0.687697,0.373140,0.422805,0.703419,0.127338,0.117446,-0.425781,0.068350,-0.720529,0.837612,...,0.655523,-0.212812,0.277528,-0.897911,0.158041,-0.040538,-0.158093,-0.211901,0.058915,0.015522,-1.233338,0.247065,-0.182583,0.227809,-0.277291,-0.764322,-0.186407,0.094022,-0.487705,0.407668,0.121348,0.043126,-0.346308,1.177114,0.148149,0.204937,-0.046141,-0.187629,-0.677044,-0.653860,0.512245,-0.674734,0.597736,0.465102,0.800832,-0.049003,0.682565,0.224744,-0.158548,0.400044
1,0.042776,0.769257,-0.534479,-0.458499,-0.482915,0.337564,0.382991,-0.267815,0.484626,0.339221,0.132441,-0.134220,-0.284833,-0.465669,0.590517,0.185661,0.106298,0.832315,0.710558,-0.391099,0.591829,0.258446,0.431710,1.131688,-0.826284,-1.046071,0.517689,0.339877,0.499298,-0.072435,-0.177317,0.429720,0.408830,0.235232,0.638388,-0.180402,-0.561656,-0.368239,-0.768297,1.211242,...,0.330883,-0.351999,-0.086055,-0.331878,-0.146087,-0.617440,-0.414525,-0.180322,-0.731779,1.484863,-0.058415,0.161016,0.277114,0.078595,0.435155,-0.827538,0.125246,0.332498,-0.175819,0.351711,-0.052820,0.264178,-0.108113,-0.256685,0.020180,0.721932,0.544950,0.342161,-0.212417,-0.133128,0.028694,-0.446446,-0.813203,0.604336,0.678082,0.149801,0.533961,0.061949,-0.258430,0.265638
2,-0.522589,0.114569,0.431791,-0.398795,-0.241518,0.456064,0.319502,-0.343745,-0.416592,0.674393,0.337848,0.080295,-0.768550,0.111642,-0.090020,0.230241,-0.184860,-0.941268,0.148332,-0.165027,0.659888,-0.192202,0.976797,0.567914,-0.426153,-0.928917,0.824482,0.051350,0.526476,0.144075,-0.717416,-0.169927,0.555874,0.070422,-0.059264,0.198260,-0.056576,-0.251441,0.139741,1.498244,...,0.993241,0.059024,0.080887,-1.129185,0.094610,0.147757,-0.467052,-1.025221,-0.604192,0.313416,-0.482065,0.378184,-0.203597,0.201701,-0.052521,-0.584281,-0.248406,0.144632,-0.607251,-0.182508,0.561825,-0.273579,0.029707,1.467472,-0.341308,-0.088675,0.655556,-0.091501,-0.670451,-0.603300,1.496794,-0.277824,0.138840,0.432965,0.335506,-0.159509,1.373224,-0.013391,-0.554896,0.228185
3,-0.129685,0.012110,-0.399763,0.147647,-0.321566,1.342853,0.024797,-0.019528,0.099430,0.884714,0.327770,0.519433,-1.185143,0.192327,0.854991,-0.043323,0.147071,-0.390543,0.594644,0.120655,0.154786,-0.623761,0.494850,0.576021,-0.815082,-0.704128,0.409457,0.075465,0.584707,0.491554,-0.734645,0.185978,0.598278,0.118420,0.242071,0.029967,-0.830474,-0.038148,0.081158,1.443755,...,0.416300,-0.428367,0.344655,-1.022718,-0.249847,-0.158515,-0.220469,-0.719193,-0.189532,-0.005368,-0.319584,-0.101570,-0.018407,0.232137,0.152378,-0.565468,-0.313281,0.086904,-0.877762,-0.069227,0.163978,0.437505,0.227616,0.669895,0.019056,0.622238,0.492101,0.434318,-0.521018,-0.816610,0.530074,0.809173,0.358515,0.768966,0.301405,0.239032,0.494625,-0.267202,-0.921375,0.558884
4,0.405896,-0.302498,-0.027782,-0.579789,-0.856559,-0.231605,0.327730,0.230679,-0.584870,0.285647,0.074496,-0.506991,-0.619314,-0.214336,-0.672246,-0.157722,0.125682,-0.297523,-0.301998,-0.813917,0.340038,0.520582,1.000861,0.706049,-0.777594,-1.455639,-0.877743,-1.041520,-0.007729,-0.005987,-0.862645,-0.137763,0.611269,0.321842,-0.087831,-0.538717,0.462809,1.380364,-0.867766,0.723132,...,0.891470,-0.070149,0.573725,-0.251865,-0.209287,-0.343292,-1.352250,-0.301036,-0.802370,-0.312740,-0.340300,0.066075,-0.484198,0.163744,-0.068958,0.568179,-0.025925,0.140922,-0.196547,0.255460,0.642835,-0.050075,0.230042,0.237529,-0.645893,0.161264,-0.395470,0.104292,0.013761,0.046796,0.382459,-0.452338,-0.593433,0.893366,-0.300483,0.356036,0.438115,0.065280,-0.099608,0.444399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159566,-0.510485,0.216705,-0.015656,0.121589,-0.339742,1.338184,0.105381,0.213534,0.822716,0.678602,-0.313981,-0.127033,-1.331515,0.229426,0.492469,0.001151,-0.163017,-0.965023,-0.657151,0.449369,-0.406256,0.774905,0.312791,-0.013527,-1.064700,-0.583629,0.879459,-0.554354,1.182123,0.342750,-0.765646,-0.237614,-0.254165,-0.324925,0.383052,0.757617,0.122128,0.150335,0.195347,1.644333,...,1.225624,-0.060396,0.574494,-1.207805,-0.297595,0.020707,0.466131,-0.825015,-0.449466,-0.878305,-0.404528,0.684702,-0.715841,0.258301,-0.460953,0.824066,-0.773020,0.221756,-0.282607,-0.790437,0.190119,-0.051097,0.415826,0.860017,-0.206203,0.488409,-0.055761,0.643996,-0.636970,-0.738182,0.873831,0.752251,0.861921,1.635840,-0.872651,-0.037035,0.649196,-0.631826,-0.717702,0.051341
159567,0.081488,-0.498773,0.764046,-0.436740,-0.297739,0.966306,-0.124069,0.086902,0.016516,0.599731,0.146819,0.504232,-0.561746,0.171733,-0.779099,0.605739,0.161095,0.230423,-0.145528,-0.061994,1.032135,-0.160121,1.235367,0.739037,-1.027188,-0.897694,-0.176218,-0.402334,0.021950,-0.089296,-0.856175,-0.652493,0.408585,-0.037494,-0.047442,-0.386827,-0.562368,-0.124930,-0.239087,1.318023,...,0.904233,-0.479097,0.340081,-0.998738,-0.074322,-0.200986,-1.049309,0.736117,-0.073976,-0.059456,-0.341028,0.569800,0.198757,0.241210,-0.135481,-0.572104,-0.348404,-0.345836,-0.638557,0.074879,-0.240529,-0.087999,0.264984,0.689507,0.308759,-0.367909,-0.083293,-0.124133,0.014218,0.630731,0.028414,-0.234569,-0.211892,0.572309,-0.188599,-0.136985,1.511574,0.423671,-0.479400,0.115040
159568,0.082960,-0.153329,0.115622,0.290357,-0.257353,0.325823,-0.323688,-0.201838,-0.259384,-0.072013,-0.662914,-0.277582,-0.269339,0.296375,0.475149,-0.430156,0.393003,0.074526,-0.165118,-0.750220,0.362579,0.434588,-0.476240,0.124483,-0.208219,-0.393642,0.026546,-0.455732,0.499025,0.159729,-0.654044,0.423032,-0.775423,-0.258388,0.053996,0.055592,0.032421,-0.318796,-0.329750,-0.120033,...,0.415304,-0.334459,-0.238608,-0.177880,-0.833788,-0.368345,-0.218342,-0.090639,0.150749,0.001236,-0.292386,0.124220,-0.230605,0.024439,0.685773,0.088198,-0.244098,-0.072034,-0.381074,-0.377652,0.290846,-0.542612,0.275710,0.660884,0.521454,0.093697,-0.485760,0.166106,-0.499607,-1.073984,-0.093419,0.798541,0.252973,0.444291,-0.090714,0.241036,0.372785,-0.886916,-0.228867,-0.167962
159569,-0.784958,-0.477689,0.962255,-0.278146,0.981380,0.739516,0.850752,-0.119028,0.324418,1.202149,1.249980,0.301821,-0.729192,-0.026782,-0.478074,0.986063,0.757819,0.592056,0.065974,0.408321,0.494744,0.669181,1.524876,0.506469,-1.278466,-1.214669,0.050262,-0.055061,0.595700,-0.356135,-1.742989,0.402901,1.630029,0.910216,0.225530,-0.249871,-0.539772,0.195809,0.478134,0.375094,...,1.093023,-0.724666,1.408666,-1.741063,-0.238928,-0.436923,-0.830976,-0.518971,0.236042,-0.004138,-1.692565,-0.885432,-0.725594,0.582718,-0.710749,-0.378505,-0.694530,0.449719,-0.710061,-0.056255,0.510385,0.679257,0.277426,0.771955,1.407828,0.607631,1.034705,-0.217348,0.012477,-0.593937,0.014694,0.625953,-0.435043,1.326391,0.835512,0.486105,2.173981,0.458142,-1.172675,0.777546


In [None]:
model_w2v.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(56680, 100)

In [None]:
import numpy as np
np.set_printoptions(suppress=True)

x_w2v=np.array(wordvec_df)
y_w2v=np.array(df_copy[multi_labels])
print(y_w2v.shape, x_w2v.shape)

(159571, 6) (159571, 100)


In [None]:
y_w2v

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
#Splitting the data set
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(x_w2v, y_w2v, test_size=0.3, random_state=42)

In [None]:
print(X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape )

(111699, 100) (47872, 100) (111699, 6) (47872, 6)


In [None]:
y_train1[7]

array([1, 1, 1, 1, 1, 0])

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input,Embedding,Dense,Flatten
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import f1_score

epochs = 25
batch_size = 1024
loss = "binary_crossentropy"
optimizer = "adam"
metrics = ["accuracy"]

# Build neural network
model = models.Sequential()
model.add(Dense(100, activation='relu', input_shape=(100,)))
model.add(Dense(512, activation='relu'))
model.add(Dense(6, activation='sigmoid'))
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_14 (Dense)             (None, 512)               51712     
_________________________________________________________________
dense_15 (Dense)             (None, 6)                 3078      
Total params: 64,890
Trainable params: 64,890
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss=loss,optimizer=optimizer,metrics= metrics)

model.fit(X_train1,y_train1,epochs=epochs,batch_size=batch_size,validation_data=(X_test1,y_test1))



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fb40c851990>

In [None]:

predictions = model.predict(X_test1)
# predictions = [0 if i<0.5 else 1 for i in predictions]

predictions[0:10]
# f1_score(y_test1, predictions)

array([[0.2895664 , 0.0076918 , 0.17042056, 0.00138131, 0.12348351,
        0.08823591],
       [0.00024009, 0.00000003, 0.00002617, 0.00000008, 0.00015441,
        0.00000018],
       [0.01149571, 0.00001156, 0.00171801, 0.00022948, 0.00077978,
        0.00000873],
       [0.00036663, 0.00000012, 0.00002587, 0.00000004, 0.0000223 ,
        0.00001657],
       [0.00038269, 0.00000001, 0.00000591, 0.        , 0.00001635,
        0.00000046],
       [0.00236821, 0.00001168, 0.00115061, 0.00000023, 0.00012624,
        0.00003474],
       [0.0025306 , 0.0000004 , 0.00024188, 0.        , 0.00004764,
        0.00000061],
       [0.00150481, 0.00000027, 0.00121206, 0.00000048, 0.01227781,
        0.0000192 ],
       [0.00132152, 0.00000209, 0.00045952, 0.00000034, 0.00003496,
        0.00001037],
       [0.00006074, 0.00000002, 0.00001111, 0.0000006 , 0.00000874,
        0.00000025]], dtype=float32)

In [None]:
np.argmax(predictions)

1356

In [None]:

predictions.shape

(47872, 6)

In [None]:
prob=model.predict_proba(X_test1)



In [None]:
prob[0]

array([0.2256237 , 0.00506327, 0.06383434, 0.00327659, 0.06253624,
       0.20952314], dtype=float32)

Testing with own sentences

In [None]:
sentence=['hey motherfucker, what are you doing?' ]

In [None]:
test_df=pd.DataFrame(sentence)

In [None]:
test_df[0]

0    hey motherfucker, what are you doing?
Name: 0, dtype: object

In [None]:
#cleaning Test data
test_df[0]=test_df[0].apply( lambda x : re.sub("\n|\r", " ", x)) #Line breaks
test_df[0]=test_df[0].apply( lambda x : re.sub('[^-9A-Za-z ]', '', x)) # Punctuations
test_df[0]=test_df[0].apply( lambda x : re.sub('[%s]' % re.escape(string.punctuation), '', x.lower()))
test_df[0]=test_df[0].apply( lambda x : re.sub(r'[^\x00-\x7f]',r'', x)) #non Ascii
test_df[0]=test_df[0].apply( lambda x : re.sub('@\S+', '', x))# mentions @
test_df[0]=test_df[0].apply( lambda x : re.sub('#\S+', '', x)) # Hashtags
test_df[0]=test_df[0].apply( lambda x : re.sub('https*\S+', '', x)) #Links

test_df[0]=test_df[0].map(remove_stopwords)

In [None]:
test_df[0]

0    hey motherfucker
Name: 0, dtype: object

In [None]:
tokenized_sent = test_df[0].apply(lambda x: x.split())

In [None]:
l=list(model_w2v.wv.vocab)

In [None]:
l[1]

'edits'