In [14]:
SEED = 7789
import pandas as pd
import numpy as np
import random
from nltk.tokenize import word_tokenize
import nltk
import string, re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    treebank_tag = nltk.pos_tag([word])[0][1]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to the noun constant.
    return wordnet.NOUN

def tokenize_with_lemmentize(document ,lemmentize = True):
    """Tokenize a document (optionally lemmatizing) for use as a vectorizer tokenizer.

    Steps: strip ``http...`` URLs, split with ``word_tokenize``, keep purely
    alphabetic tokens, lower-case them, drop English stopwords, and (optionally)
    lemmatize each remaining token with the POS returned by ``get_wordnet_pos``.

    Parameters
    ----------
    document : object
        Text to tokenize; it is coerced with ``str()`` before processing.
    lemmentize : bool, default True
        When True return lemmas, otherwise return the filtered lower-cased tokens.

    Returns
    -------
    list of str
        The processed tokens, in document order.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests instead of scanning a list per token.
    removed = set(stopwords.words('english')) | set(string.punctuation)
    document = re.sub(r'http\S+', '', str(document))
    tokens = word_tokenize(document)

    # Compare the LOWER-CASED token with the stopword set: the previous check used
    # the raw token, so capitalised stopwords ("The", "And") were never removed.
    words = [
        raw.lower()
        for raw in tokens
        if raw.isalpha() and raw.lower() not in removed
    ]
    if lemmentize:
        return [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
    return words
def concat_messages(df):
    """Fill missing post text with '' and add a combined ``concat`` column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``post_name``, ``post_message`` and
        ``post_description``. The frame is modified in place.

    Returns
    -------
    None
        ``df`` gains/overwrites the column ``concat`` holding
        ``post_name + ' ' + post_message + ' ' + post_description``.
    """
    for text_col in ('post_name', 'post_message', 'post_description'):
        # Assign the filled column back explicitly. Calling
        # ``df.<col>.fillna(..., inplace=True)`` mutates a temporary Series, which
        # warns on recent pandas and leaves ``df`` unchanged under Copy-on-Write.
        df[text_col] = df[text_col].fillna('')
    df['concat'] = df.post_name + ' ' + df.post_message + ' ' + df.post_description

In [88]:
# Load the labelled posts.
df = pd.read_csv('/home3/r05322021/Desktop/FB_hatecrime/Data/label/immigration_label.csv', encoding='utf-8', engine='python')
# Keep only rows whose three labels are all 0 or 1.
# .copy() makes the filtered frame independent of the one returned by read_csv,
# so the in-place column writes done later by concat_messages(df) do not hit a
# slice (SettingWithCopyWarning / silently lost writes).
df = df[(df.Mexican_related.isin([0,1])) & (df.Muslim_related.isin([0,1])) & (df.immigration_related.isin([0,1]))].copy()

In [89]:
# Name of the text column that is vectorized further down.
col = 'concat'
# y_col = 'Ethnics_related'
# Adds df['concat'] (name + message + description) and fills missing text, in place.
concat_messages(df)
# Work on an independent copy so the cleaning below does not touch df.
target = df.dropna(subset=[col]).copy()
# target = down_sample(target, y_col)
# Strip carriage returns and lower-case the combined text.
target[col] = target[col].str.replace('\r', '').str.lower()
# target[y_col].value_counts()

In [90]:
# Bag-of-words counts over the combined post text, keeping terms that occur in
# at least 5 documents. Tokenization is fully delegated to
# tokenize_with_lemmentize; scikit-learn ignores `token_pattern` whenever a
# custom `tokenizer` is supplied (and warns about it), so it is set to None
# rather than to a regex that would never be applied.
bow = CountVectorizer(tokenizer=tokenize_with_lemmentize,
                      token_pattern=None,
                      min_df=5
                     )
X = bow.fit_transform(target[col])

In [91]:
# Learn IDF weights from the count matrix and re-weight that same matrix.
tfidf = TfidfTransformer()
tfidf_X = tfidf.fit_transform(X)

In [92]:
# Column-wise sum of the TF-IDF matrix (one total per column of tfidf_X).
tfidf_X.sum(axis=0)

matrix([[3.45616666, 1.49064704, 3.74834289, ..., 6.66254133, 3.61531977,
         1.30712094]])

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# Use the notebook-wide SEED (defined in the first cell) instead of repeating
# the literal, so every seeded component is controlled from one place.
kf = KFold(n_splits=5, random_state=SEED, shuffle=True)
# Label column to model; it is converted to a categorical so that integer
# codes (y) and the code -> label mapping come from the same categories.
y_col = 'immigration_related'
target[y_col] = target[y_col].astype('category')
y = target[y_col].cat.codes.values
def kfold(X, y):
    """Yield ``(X_train, y_train, X_valid, y_valid)`` for each split of ``kf``.

    ``X`` and ``y`` are indexed positionally with the index arrays produced by
    the module-level ``kf.split(X)``.
    """
    for train_idx, valid_idx in kf.split(X):
        yield (X[train_idx], y[train_idx], X[valid_idx], y[valid_idx])

In [113]:
# Maps the integer category code back to the original label value.
y_map = dict(enumerate(target[y_col].cat.categories))
def random_show(indexes, pred, prob):
    """Print text, prediction, ground truth and class probabilities for posts.

    Despite the name, no sampling is done (the sampling loop is commented out):
    every ``(index, prediction, probability)`` triple is printed.

    Parameters
    ----------
    indexes : iterable of int
        Positional row indices into the module-level ``target`` frame.
    pred : iterable of int
        Predicted category codes, aligned with ``indexes``.
    prob : iterable of sequence of float
        Per-class probabilities, aligned with ``indexes``; position ``i`` is
        reported under the label ``y_map[i]``.
    """
    print(y_map)
    #for idx, p in random.sample(list(zip(indexes, pred)), k=min(len(indexes), 5)):
    for idx, p, probability in zip(indexes, pred, prob):
        row = target.iloc[idx]
        # Iterate over the probability vector itself instead of assuming
        # exactly two classes.
        print('\n', '\n'.join([row.post_name, row.post_message, row.post_description]), 
              '\nprediction', y_map[p], 'groundtruth', row[y_col], 
              'prob', {y_map[i]: class_prob for i, class_prob in enumerate(probability)})

In [126]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# 5-fold cross-validation of a class-weighted logistic regression on the TF-IDF
# features. The split is done inline (rather than via kfold()) because the
# commented-out error inspection at the bottom needs valid_idx.
# NOTE(review): bow and tfidf were fit on all rows before this split, so the
# validation rows contributed to the vocabulary and IDF weights.
for train_idx, valid_idx in kf.split(tfidf_X):
    X_train, y_train, X_valid, y_valid = tfidf_X[train_idx], y[train_idx], tfidf_X[valid_idx], y[valid_idx]
    # random_state uses the shared SEED instead of a repeated literal.
    logit_m = LogisticRegression(
        random_state=SEED, multi_class='ovr', solver='liblinear',
        class_weight={0:1, 1:2.5}, penalty='l2', C=5).fit(X_train, y_train)
    print('accuracy train', logit_m.score(X_train, y_train), 'valid', logit_m.score(X_valid, y_valid))
    
    pred = logit_m.predict(X_valid)
    print(classification_report(y_valid, pred, target_names=['No', 'Yes']))
#     errors = valid_idx[pred != y_valid]
#     random_show(errors.tolist(), pred[pred != y_valid], logit_m.predict_proba(X_valid)[pred != y_valid])
    

accuracy train 0.9385823960007141 valid 0.7987152034261242
              precision    recall  f1-score   support

          No       0.87      0.84      0.86      1008
         Yes       0.63      0.69      0.66       393

    accuracy                           0.80      1401
   macro avg       0.75      0.77      0.76      1401
weighted avg       0.81      0.80      0.80      1401

accuracy train 0.9396536332797715 valid 0.7987152034261242
              precision    recall  f1-score   support

          No       0.88      0.83      0.85       988
         Yes       0.64      0.72      0.68       413

    accuracy                           0.80      1401
   macro avg       0.76      0.77      0.77      1401
weighted avg       0.81      0.80      0.80      1401

accuracy train 0.9394858978936095 valid 0.8014285714285714
              precision    recall  f1-score   support

          No       0.88      0.83      0.86      1003
         Yes       0.63      0.72      0.67       397

    a

In [116]:
from sklearn.svm import SVC
# Cross-validate a class-weighted SVC on the TF-IDF features using the folds
# produced by kfold(); prints train/validation accuracy and a per-class report.
# NOTE(review): the recorded output reports class supports (e.g. 781/620) that
# differ from the logistic-regression folds (e.g. 1008/393), so it was
# presumably produced with a different label column — re-run to confirm.
for X_train, y_train, X_valid, y_valid in kfold(tfidf_X, y):
    svm_m = SVC(random_state=SEED, verbose=True, gamma='scale', class_weight={0:1, 1:2}).fit(X_train, y_train)
    print('\n', svm_m.score(X_train, y_train), svm_m.score(X_valid, y_valid))
    pred = svm_m.predict(X_valid)
    print(classification_report(y_valid, pred, target_names=['No', 'Yes']))

[LibSVM]
 0.963756472058561 0.7994289793004996
              precision    recall  f1-score   support

          No       0.87      0.75      0.81       781
         Yes       0.73      0.86      0.79       620

    accuracy                           0.80      1401
   macro avg       0.80      0.81      0.80      1401
weighted avg       0.81      0.80      0.80      1401

[LibSVM]
 0.9607212997678986 0.7865810135617416
              precision    recall  f1-score   support

          No       0.88      0.74      0.80       816
         Yes       0.70      0.85      0.77       585

    accuracy                           0.79      1401
   macro avg       0.79      0.80      0.79      1401
weighted avg       0.80      0.79      0.79      1401

[LibSVM]
 0.9628704034273474 0.795
              precision    recall  f1-score   support

          No       0.87      0.77      0.82       832
         Yes       0.71      0.84      0.77       568

    accuracy                           0.80      140

In [127]:
# Final logistic regression fit on every row; this is the model that gets saved.
# random_state uses the shared SEED instead of a repeated literal.
# NOTE(review): the cross-validation loop passed C=5 and penalty='l2'
# explicitly, while this fit leaves both at their defaults — confirm which
# configuration is meant to be persisted.
logit_m = LogisticRegression(
        random_state=SEED, multi_class='ovr', solver='liblinear', class_weight={0:1, 1:2.5}).fit(tfidf_X, y)
# The report below is computed on the same rows the model was fit on, so it
# describes training fit rather than held-out performance.
pred = logit_m.predict(tfidf_X)
print(classification_report(y, pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.95      0.87      0.91      4938
         Yes       0.74      0.88      0.80      2064

    accuracy                           0.87      7002
   macro avg       0.84      0.88      0.86      7002
weighted avg       0.89      0.87      0.88      7002



In [123]:
# Final SVC fit on every row; the report is computed on those same rows.
# NOTE(review): class_weight here is {0:1, 1:13} while the cross-validation
# loop used {0:1, 1:2}, and the recorded output shows supports of 6356/646
# versus 4938/2064 for the logistic model — presumably this cell was last run
# against a different label column; confirm before reusing.
svm_m = SVC(random_state=SEED, verbose=True, gamma='scale', class_weight={0:1, 1:13}).fit(tfidf_X, y)
pred = svm_m.predict(tfidf_X)
print(classification_report(y, pred, target_names=['No', 'Yes']))

[LibSVM]              precision    recall  f1-score   support

          No       1.00      0.99      1.00      6356
         Yes       0.93      1.00      0.96       646

    accuracy                           0.99      7002
   macro avg       0.97      1.00      0.98      7002
weighted avg       0.99      0.99      0.99      7002



In [128]:
from joblib import dump, load
# Build the file-name prefix by removing a trailing '_related' explicitly.
# The previous y_col[:-8] slice silently chopped eight arbitrary characters
# from any label name that did not end with that suffix.
model_label = re.sub(r'_related$', '', y_col)
# NOTE(review): only the classifier is saved here; the fitted vectorizer and
# TF-IDF transformer dumps are commented out — confirm they are persisted
# elsewhere, since the model cannot score raw text without them.
dump(logit_m, f'/home3/r05322021/Desktop/model/LR/{model_label}_logit.joblib')
# dump(svm_m, f'/home3/r05322021/Desktop/model/SVM/{y_col[:-8]}_svm.joblib')
# dump(bow, '/home3/r05322021/Desktop/model/transformation/BagOfWord_immigration.joblib')
# dump(tfidf, '/home3/r05322021/Desktop/model/transformation/TFIDF_immigration.joblib')

['/home3/r05322021/Desktop/model/LR/immigration_logit.joblib']

In [102]:
# Displays y_col without its last 8 characters (the length of '_related').
y_col[:-8]

'immigration'