In [1]:
import os, json, re
from pathlib import Path
import pandas as pd
import numpy as np
import glob

from nltk.tokenize import TweetTokenizer
import nltk
from nltk import pos_tag
from six import string_types

pd.set_option('display.max_columns', None)
# import emoji

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import classification_report as clsr
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def strip_ASCII(text):
    return re.sub("([^\x00-\x7F])+"," ", text)

def prediction(X, y, n, cv=False, k=10):

    labels = LabelEncoder()
    y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
    names = labels.classes_
    print("shape of X:", X.shape)
    print("shape of y:", y.shape)
    
    X_train, X_test, y_train, y_test = tts(X, y, test_size=n)
    print("shape of X_train:", X_train.shape)
    print("shape of y_train:", y_train.shape)
    print("shape of X_test:", X_test.shape)
    print("shape of y_test:", y_test.shape)
    
    if cv:
        lg = LogisticRegressionCV(cv=k, random_state=0)        
    else:
        lg = LogisticRegression()
        
    lg.fit(X_train, y_train)
    y_pred = lg.predict(X_test).reshape(-1,1)
    print("shape of y_pred:", y_pred.shape)
    print(clsr(y_test, y_pred, target_names=names))
    print(cm(y_test, y_pred, labels=[0,1]))
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print('naive model (only no)')
    y_naive = np.array(['no']*len(y_test))
    y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
    print(clsr(y_test, y_naive, target_names=names))
    print(cm(y_test, y_naive, labels=[1,0]))

    return acc, f1

In [3]:
# path_to_json = Path('C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\data\\labelled_tweets')
path_to_json = Path('/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab')
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)
# file_list = file_list[:-2] 
file_list

['/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_3a.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_6a.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_2a.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_8a.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_4b.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_1b.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_7b.json',
 '/Users/peaceforlives/Documents/Projects/cyberbullying/data/labelled_tweets/ab/random_tweets_5b.json']

In [4]:
tweets = pd.read_json(file_list[0], orient="records", lines=True, encoding='utf-8-sig')
tweets.head()

Unnamed: 0,id,full_tweet,bullying_trace,bullying_role,form_of_bullying,bullying_post_type
0,1175775344231628800,everyone take note!!!! most people forget they...,no,,,
1,1160966582656327680,@chaewona_ @royalbiink hey not generalizing y'...,no,,,
2,1171979110802849792,@breaking911 bully gives clothes and films it ...,no,,,
3,1164840982149586944,"the god of the old testament is ""jealous&amp;p...",no,,,
4,1168206761553055744,yet the war criminal wants to spunk away £129 ...,no,,,


In [5]:
target = 'bullying_trace'
tweets = tweets[['id', 'full_tweet', target]]

# tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# tweets.head()

In [6]:
# def strip_emoji(text):
# #     print(emoji.emoji_count(text))
#     new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
#     return new_text


# tweets['full_tweet'] = [ [strip_emoji(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# tweets.head()

In [7]:
# def strip_repeat(text):  
# #     return re.sub(r'(.)\1+', r'\1\1', text) 
#     return re.sub(r'(\w)\1+', r'\1', text)

# strip_repeat('heheehehe')
# # tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# # tweets.head()

In [8]:
# tweets['len'] = tweets['full_tweet'].apply(len)
# tweets.head()

In [9]:
# tweets.groupby('bullying_trace')['len'].describe()

## Count Vectorizer ## 

In [10]:
tweets.head()

Unnamed: 0,id,full_tweet,bullying_trace
0,1175775344231628800,everyone take note!!!! most people forget they...,no
1,1160966582656327680,@chaewona_ @royalbiink hey not generalizing y'...,no
2,1171979110802849792,@breaking911 bully gives clothes and films it ...,no
3,1164840982149586944,"the god of the old testament is ""jealous&amp;p...",no
4,1168206761553055744,yet the war criminal wants to spunk away £129 ...,no


In [11]:
cv = CountVectorizer()
cv_fit = cv.fit_transform([' '.join(tweet) for tweet in tweets['full_tweet'] ])

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
words = np.asarray(cv.get_feature_names())
count = np.asarray( cv_fit.toarray().sum(axis=0) )
corpusdictionary = dict(zip(words,count))

count = pd.DataFrame.from_dict(corpusdictionary, orient='index', columns=['count'])
count = count.sort_values(by=['count'], ascending=False)
# count.to_csv('count_'+target+'.csv', index=True)
count

In [None]:
X = cv_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

In [None]:
acc, f1 = prediction(X, y, n=0.2)
print(acc, f1)

In [None]:
acc, f1 = prediction(X, y, n=0.2, cv=True)
print(acc, f1)

In [None]:
idx = range(100, X.shape[0], 500)
scores = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True)
    scores[i] = [acc , f1]

In [None]:
scores

In [None]:
labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

X_train, X_test, y_train, y_test =tts(X, y, random_state=0, stratify=y, shuffle=True)
print("shape of X_train:", X_train.shape)
print("shape of y_train:", y_train.shape)
print("shape of X_test:", X_test.shape)
print("shape of y_test:", y_test.shape)

scores_copy = { }
idx = range(200, X_train.shape[0], 300)
for i in idx:
    lg = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)         
    lg.fit(X_train[:i,], y_train[:i])
    y_pred = lg.predict(X_test)
    print("shape of y_pred:", y_pred.shape)
    
    print(clsr(y_test, y_pred))
    print(cm(y_test, y_pred))#, labels=names))
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))
    
    scores_copy[i] = [acc , f1]

scores_copy

## TDIDF Vectorizer ## 

In [None]:
def identity(words):
    return words
vectorizer = TfidfVectorizer(tokenizer=identity, encoding='utf-8', preprocessor=None, use_idf=True,
                             lowercase=False, ngram_range=(1,2)
#                              , stop_words='english',
#                              min_df=5, max_df=0.8
                            )

In [None]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100  

In [None]:
# X = pd.DataFrame(X)
# X = pd.concat( [X, tweets['len']], axis=1)
# X.shape

In [None]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['idf_weights'])
weights = df_idf.sort_values(by=['idf_weights'], ascending=False)
weights.to_csv('tdidfweights_'+target+'.csv', index=True)
# weights.to_csv('tdidfweights_noemoji.csv', index=True)
weights
# the lower the idf value of a word, the less unique it is to any particular document
# terms with higher weight scores are considered to be more important

In [None]:
# tdidf score of first tweet
# if a word occurs multiple times in a document, we should boost its relevance as it should be 
# more meaningful than other words that appear fewer times (TF)
# On the other hand, if a word occurs many times in all documents, maybe it is just a frequent word
vector = pd.DataFrame(X[1].T.todense(), index=vectorizer.get_feature_names(), columns=['tdidf'])
vector.sort_values(by=['tdidf'], ascending=False)

In [None]:
D = vectorizer.vocabulary_
max_word = max(D, key=D.get)
max_value = max(D.values())
print(max_word, max_value)

In [None]:
acc, f1 = prediction(X, y, n=0.2)
print(acc, f1)

In [None]:
acc, f1 = prediction(X, y, n=0.2, cv=True)
print(acc, f1)

In [None]:
idx = range(100, X.shape[0], 500)
scores_tfidf = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True)
    scores_tfidf[i] = [acc , f1]

In [None]:
scores_tfidf

In [None]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100  

labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

X_train, X_test, y_train, y_test =tts(X, y, random_state=0, stratify=y, shuffle=True)
print("shape of X_train:", X_train.shape)
print("shape of y_train:", y_train.shape)
print("shape of X_test:", X_test.shape)
print("shape of y_test:", y_test.shape)

scores_copy = { }
idx = range(200, X_train.shape[0], 200)
for i in idx:
    lg = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)         
    lg.fit(X_train[:i,], y_train[:i])
    y_pred = lg.predict(X_test)
    print("shape of y_pred:", y_pred.shape)
    
    print(clsr(y_test, y_pred))
    print(cm(y_test, y_pred))#, labels=names))
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))
    
    scores_copy[i] = [acc , f1]

scores_copy

In [None]:
print(clsr(y_test, y_pred))
print(cm(y_test, y_pred))#, labels=names))
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))

## Hashing Vectorizer ##

In [None]:
def listToString(s):  
    
    # initialize an empty string 
    str1 = ""  
    
    # traverse in the string   
    for ele in s:  
        str1 += ' '+ele   
    
    # return string   
    return str1  

text = [ listToString(tweet) for tweet in tweets['full_tweet']]

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

hash_vectorizer = HashingVectorizer(n_features=5000)
hash_fit = hash_vectorizer.transform(text)

X = hash_fit
X.shape

In [None]:
y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100  

In [None]:
acc, f1 = prediction(X, y, n=0.2)
print(acc, f1)

In [None]:
acc, f1 = prediction(X, y, n=0.2, cv=True)
print(acc, f1)