In [19]:
!pip install pandas 
!pip install scikit-learn
!pip install ekphrasis
!pip install emoji
!pip install joblib



In [20]:
import pandas as pd
from tweet_preprocess import preprocess_tweets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [21]:
# Fetch the NLTK "stopwords" corpus into the local nltk_data directory.
# The call is a no-op when the package is already up to date and returns True,
# which is what the cell displays.
# NOTE(review): presumably required by preprocess_tweets — confirm against tweet_preprocess.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/nilesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Read data

In [22]:
# The dataset is stored as JSON Lines: one JSON record per line.
df = pd.read_json('Dataset for Detection of Cyber-Trolls.json', lines=True)

# Each annotation looks like {'notes': '', 'label': ['1']}; keep only the label
# itself ('1' or '0') as a string. Index into the structure directly instead of
# stringifying the list and splitting on quote characters, which depends on the
# exact repr of the list and breaks if a label ever contains a quote.
df["annotation"] = df["annotation"].map(lambda annotation: str(annotation["label"][0]))

# Drop the column that is not used anywhere below
df = df.drop(columns=['extras'])

# Let's see some of the data
df.head()

Unnamed: 0,content,annotation
0,Get fucking real dude.,1
1,She is as dirty as they come and that crook ...,1
2,why did you fuck it up. I could do it all day...,1
3,Dude they dont finish enclosing the fucking s...,1
4,WTF are you talking about Men? No men thats n...,1


# Preprocess text

In [23]:
# Clean every tweet with the project's preprocess_tweets helper.
# NOTE: this overwrites df["content"] with its own processed version, so
# re-running the cell feeds already-processed text through the helper again.
df["content"] = df["content"].apply(preprocess_tweets)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The split is shuffled with a fixed
# seed (reproducible) and stratified on the label so both parts keep the same
# class proportions.
texts = df["content"].values
labels = df["annotation"].values

X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=df["annotation"],
)

# Sanity check: sizes of the four resulting arrays
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(15000,) (5001,) (15000,) (5001,)


In [25]:
# Text -> features: unigram + bigram counts, then TF-IDF weighting.
# The pipeline is fitted on the training texts only, so nothing from the test
# split influences the vocabulary or the IDF weights.
feature_steps = [
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
]
pipe = Pipeline(feature_steps)
pipe.fit(X_train)

# Persist the fitted vectoriser so it can be reloaded outside this notebook
joblib.dump(pipe, 'sklearn_pipeline.pkl')

['sklearn_pipeline.pkl']

In [26]:
# Turn both splits into TF-IDF features using the fitted pipeline.
# NOTE: X_train / X_test are rebound from raw text to feature matrices here, so
# this cell must run exactly once after the split cell.
X_train, X_test = pipe.transform(X_train), pipe.transform(X_test)

In [27]:
# Train a multinomial Naive Bayes model on the TF-IDF features
# (fit() returns the estimator itself, so the calls can be chained).
classifier = MultinomialNB().fit(X_train, y_train)

# Persist the trained model next to the saved feature pipeline
joblib.dump(classifier, 'ml_model.pkl')

['ml_model.pkl']

In [28]:
# Predict labels for the held-out split
y_pred = classifier.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print(accuracy)

              precision    recall  f1-score   support

           0       0.90      0.96      0.92      3045
           1       0.92      0.83      0.87      1956

    accuracy                           0.90      5001
   macro avg       0.91      0.89      0.90      5001
weighted avg       0.91      0.90      0.90      5001

0.904619076184763


# Predict for new tweets

In [29]:
# The tweets are hard-coded here for demonstration; in the live app they are
# meant to be fetched from the Twitter API instead.
new_tweet = ["something unique ohio happy :)", "dude dont finish enclosing fucking showers hat"]
new_tweet_user = ["@user1", "@user2"]
new_tweet_location = ["#location", "#location"]

new_tweet_df = pd.DataFrame(
    {
        "content": new_tweet,
        "user": new_tweet_user,
        "location": new_tweet_location,
    }
)

# Apply the same preprocessing that was applied to the training data,
# keeping the raw text alongside the processed version.
new_tweet_df["processed_content"] = new_tweet_df["content"].apply(preprocess_tweets)
new_tweet_df

Unnamed: 0,content,user,location,processed_content
0,something unique ohio happy :),@user1,#location,something unique ohio happy happy
1,dude dont finish enclosing fucking showers hat,@user2,#location,dude dont finish enclosing fucking showers hat


In [30]:
# Vectorise the processed tweets with the already-fitted pipeline (transform
# only, no refit), then classify them with the trained model.
new_tweet_features = pipe.transform(new_tweet_df["processed_content"].values)
new_tweet_df["predicted_annotation"] = classifier.predict(new_tweet_features)

In [31]:
# Display the tweets together with their predicted_annotation column
new_tweet_df

Unnamed: 0,content,user,location,processed_content,predicted_annotation
0,something unique ohio happy :),@user1,#location,something unique ohio happy happy,0
1,dude dont finish enclosing fucking showers hat,@user2,#location,dude dont finish enclosing fucking showers hat,1
