In [1]:
import os, json, re
from pathlib import Path
import pandas as pd
import numpy as np
import glob
pd.set_option('display.max_columns', None)
import emoji

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import classification_report as clsr
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [17]:
def strip_ASCII(text):
    return re.sub("([^\x00-\x7F])+"," ", text)

def prediction(X, y, n=0.2, cv=False, k=10, binary=True):

    labels = LabelEncoder()
    y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
    names = labels.classes_
    print("shape of X:", X.shape)
    print("shape of y:", y.shape)
    
    X_train, X_test, y_train, y_test =tts(X, y, random_state=0, stratify=y, shuffle=True)
    print("shape of X_train:", X_train.shape)
    print("shape of y_train:", y_train.shape)
    print("shape of X_test:", X_test.shape)
    print("shape of y_test:", y_test.shape)
    
    if cv:
        lg = LogisticRegressionCV(cv=k, random_state=0, max_iter=1000)         
    else:
        lg = LogisticRegression()
        
    lg.fit(X_train, y_train)
    y_pred = lg.predict(X_test).reshape(-1,1)
    print("shape of y_pred:", y_pred.shape)
    print(clsr(y_test, y_pred)) #, target_names=names))
    print(cm(y_test, y_pred, labels=[0,1,2,3,4]))
    acc = accuracy_score(y_test, y_pred)
    if not binary:
        f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    else:
        f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))

    print('naive model (only no)')
    y_naive = np.array(['report']*len(y_test))
    y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
    print(clsr(y_test, y_naive))# target_names=names))
    print(cm(y_test, y_naive, labels=[0,1,2,3,4]))

    return acc, f1

In [3]:
path_to_json = Path('C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\data\\labelled_tweets')
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)
# bullying_post_type = file_list[:-2] 
file_list

['C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\data\\labelled_tweets\\all_tweets.json']

In [4]:
tweets = pd.read_json(file_list[0],lines=True)
tweets.head()

Unnamed: 0,id,full_tweet,bullying_trace,bullying_role,form_of_bullying,bullying_post_type
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",yes,bully,general,denial
1,1162980168232853504,"[@user, @user, i, mean, it, have, some, really...",no,,,
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",yes,reporter,general,report
3,1162217363023880193,"[i, want, to, watch, a, group, of, character, ...",no,,,
4,1168710966659272705,"[i, d, never, let, a, fucking, fetus, on, twit...",no,,,


In [5]:
target = 'bullying_post_type'
tweets = tweets[['id', 'full_tweet', target]]
tweets.dropna(inplace=True)

tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
tweets.head()

Unnamed: 0,id,full_tweet,bullying_post_type
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",denial
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",report
8,1163814523683270656,"[my, sister, be, bully, this, girl, yang, baru...",cyberbullying
11,1160014315828723713,"[@user, you, re, give, someone, point, for, de...",cyberbullying
14,1170474542415745024,"[if, i, say, something, about, you, i, will, s...",cyberbullying


In [6]:
print(tweets.shape)
tweets[target].value_counts()

(3066, 3)


report             1269
accusation          857
self-disclosure     850
denial               46
cyberbullying        44
Name: bullying_post_type, dtype: int64

In [55]:
# def strip_emoji(text):
# #     print(emoji.emoji_count(text))
#     new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
#     return new_text


# tweets['full_tweet'] = [ [strip_emoji(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# tweets.head()

In [56]:
# def strip_repeat(text):  
# #     return re.sub(r'(.)\1+', r'\1\1', text) 
#     return re.sub(r'(\w)\1+', r'\1', text)

# strip_repeat('heheehehe')
# # tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# # tweets.head()

In [57]:
# tweets['len'] = tweets['full_tweet'].apply(len)
# tweets.head()

In [58]:
# tweets.groupby('bullying_trace')['len'].describe()

## Count Vectorizer ## 

In [11]:
cv = CountVectorizer()
cv_fit = cv.fit_transform([' '.join(tweet) for tweet in tweets['full_tweet'] ])

In [12]:
words = np.asarray(cv.get_feature_names())
count = np.asarray( cv_fit.toarray().sum(axis=0) )
corpusdictionary = dict(zip(words,count))

count = pd.DataFrame.from_dict(corpusdictionary, orient='index', columns=['count'])
count = count.sort_values(by=['count'], ascending=False)
count.to_csv('count_'+target+'.csv', index=True)
count

Unnamed: 0,count
be,4764
user,3643
to,2959
and,2942
bully,2911
...,...
imitate,1
imi,1
imessage,1
imbecile,1


In [31]:
X = cv_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

report             41.389432
accusation         27.951729
self-disclosure    27.723418
denial              1.500326
cyberbullying       1.435095
Name: bullying_post_type, dtype: float64

In [22]:
acc, f1 = prediction(X, y, n=0.2, binary=False)
print(acc, f1)

shape of X: (3066, 7217)
shape of y: (3066,)
shape of X_train: (2299, 7217)
shape of y_train: (2299,)
shape of X_test: (767, 7217)
shape of y_test: (767,)
shape of y_pred: (767, 1)
                 precision    recall  f1-score   support

     accusation       0.54      0.55      0.55       214
  cyberbullying       1.00      0.09      0.17        11
         denial       0.00      0.00      0.00        12
         report       0.66      0.70      0.68       317
self-disclosure       0.70      0.69      0.70       213

       accuracy                           0.64       767
      macro avg       0.58      0.41      0.42       767
   weighted avg       0.63      0.64      0.63       767

[[118   0   1  71  24]
 [  7   1   0   2   1]
 [  6   0   0   4   2]
 [ 60   0   1 221  35]
 [ 26   0   0  39 148]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44       214
  cyberbullying       0.00      0.00      0.00    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
acc, f1 = prediction(X, y, n=0.2, cv=True, binary=False)
print(acc, f1)

shape of X: (3066, 7217)
shape of y: (3066,)
shape of X_train: (2299, 7217)
shape of y_train: (2299,)
shape of X_test: (767, 7217)
shape of y_test: (767,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

shape of y_pred: (767, 1)
                 precision    recall  f1-score   support

     accusation       0.55      0.53      0.54       214
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        12
         report       0.64      0.74      0.68       317
self-disclosure       0.76      0.70      0.73       213

       accuracy                           0.65       767
      macro avg       0.39      0.39      0.39       767
   weighted avg       0.63      0.65      0.64       767

[[114   0   0  80  20]
 [  8   0   0   2   1]
 [  6   0   0   4   2]
 [ 60   0   0 233  24]
 [ 18   0   0  46 149]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44       214
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        12
         report       0.00      0.00      0.00       317
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
idx = range(200, X.shape[0], 200)
scores = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True, binary=False)
    scores[i] = [acc , f1]

200
shape of X: (200, 7217)
shape of y: (200,)
shape of X_train: (150, 7217)
shape of y_train: (150,)
shape of X_test: (50, 7217)
shape of y_test: (50,)




shape of y_pred: (50, 1)
                 precision    recall  f1-score   support

     accusation       0.62      0.57      0.59        14
  cyberbullying       0.00      0.00      0.00         3
         denial       0.00      0.00      0.00         1
         report       0.33      0.21      0.26        14
self-disclosure       0.50      0.78      0.61        18

       accuracy                           0.50        50
      macro avg       0.29      0.31      0.29        50
   weighted avg       0.45      0.50      0.46        50

[[ 8  0  0  2  4]
 [ 2  0  0  1  0]
 [ 1  0  0  0  0]
 [ 1  0  0  3 10]
 [ 1  0  0  3 14]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44        14
  cyberbullying       0.00      0.00      0.00         3
         denial       0.00      0.00      0.00         1
         report       0.00      0.00      0.00        14
self-disclosure       0.00      0.00      0.00        18

  

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (100, 1)
                 precision    recall  f1-score   support

     accusation       0.53      0.74      0.62        34
  cyberbullying       0.00      0.00      0.00         5
         denial       0.00      0.00      0.00         3
         report       0.59      0.37      0.45        27
self-disclosure       0.58      0.68      0.63        31

       accuracy                           0.56       100
      macro avg       0.34      0.36      0.34       100
   weighted avg       0.52      0.56      0.53       100

[[25  0  0  2  7]
 [ 3  0  0  0  2]
 [ 3  0  0  0  0]
 [11  0  0 10  6]
 [ 5  0  0  5 21]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.34      1.00      0.51        34
  cyberbullying       0.00      0.00      0.00         5
         denial       0.00      0.00      0.00         3
         report       0.00      0.00      0.00        27
self-disclosure       0.00      0.00      0.00        31

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (150, 1)
                 precision    recall  f1-score   support

     accusation       0.60      0.65      0.63        46
  cyberbullying       0.00      0.00      0.00         6
         denial       0.00      0.00      0.00         3
         report       0.57      0.60      0.58        43
self-disclosure       0.63      0.65      0.64        52

       accuracy                           0.60       150
      macro avg       0.36      0.38      0.37       150
   weighted avg       0.56      0.60      0.58       150

[[30  0  0  6 10]
 [ 4  0  0  2  0]
 [ 1  0  0  1  1]
 [ 8  0  0 26  9]
 [ 7  0  0 11 34]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.31      1.00      0.47        46
  cyberbullying       0.00      0.00      0.00         6
         denial       0.00      0.00      0.00         3
         report       0.00      0.00      0.00        43
self-disclosure       0.00      0.00      0.00        52

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (200, 1)
                 precision    recall  f1-score   support

     accusation       0.62      0.58      0.60        57
  cyberbullying       0.00      0.00      0.00         6
         denial       0.00      0.00      0.00         4
         report       0.66      0.64      0.65        59
self-disclosure       0.67      0.81      0.74        74

       accuracy                           0.66       200
      macro avg       0.39      0.41      0.40       200
   weighted avg       0.62      0.66      0.64       200

[[33  0  0  9 15]
 [ 3  0  0  2  1]
 [ 2  0  0  1  1]
 [ 9  0  0 38 12]
 [ 6  0  0  8 60]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44        57
  cyberbullying       0.00      0.00      0.00         6
         denial       0.00      0.00      0.00         4
         report       0.00      0.00      0.00        59
self-disclosure       0.00      0.00      0.00        74

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (250, 1)
                 precision    recall  f1-score   support

     accusation       0.67      0.60      0.63        70
  cyberbullying       0.00      0.00      0.00         7
         denial       0.00      0.00      0.00         4
         report       0.64      0.64      0.64        77
self-disclosure       0.67      0.80      0.73        92

       accuracy                           0.66       250
      macro avg       0.40      0.41      0.40       250
   weighted avg       0.63      0.66      0.64       250

[[42  0  0 15 13]
 [ 4  0  0  1  2]
 [ 1  0  0  1  2]
 [ 9  0  0 49 19]
 [ 7  0  0 11 74]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44        70
  cyberbullying       0.00      0.00      0.00         7
         denial       0.00      0.00      0.00         4
         report       0.00      0.00      0.00        77
self-disclosure       0.00      0.00      0.00        92

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (300, 1)
                 precision    recall  f1-score   support

     accusation       0.65      0.59      0.62        88
  cyberbullying       0.00      0.00      0.00         7
         denial       0.00      0.00      0.00         5
         report       0.63      0.68      0.65        96
self-disclosure       0.73      0.82      0.77       104

       accuracy                           0.67       300
      macro avg       0.40      0.42      0.41       300
   weighted avg       0.64      0.67      0.66       300

[[52  0  0 25 11]
 [ 2  0  0  1  4]
 [ 2  0  0  1  2]
 [16  0  0 65 15]
 [ 8  0  0 11 85]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.29      1.00      0.45        88
  cyberbullying       0.00      0.00      0.00         7
         denial       0.00      0.00      0.00         5
         report       0.00      0.00      0.00        96
self-disclosure       0.00      0.00      0.00       104

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (350, 1)
                 precision    recall  f1-score   support

     accusation       0.62      0.60      0.61       106
  cyberbullying       0.00      0.00      0.00         8
         denial       0.00      0.00      0.00         6
         report       0.60      0.69      0.64       114
self-disclosure       0.74      0.74      0.74       116

       accuracy                           0.65       350
      macro avg       0.39      0.41      0.40       350
   weighted avg       0.63      0.65      0.64       350

[[64  0  0 30 12]
 [ 3  0  0  3  2]
 [ 1  0  0  4  1]
 [20  0  0 79 15]
 [15  0  0 15 86]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.30      1.00      0.46       106
  cyberbullying       0.00      0.00      0.00         8
         denial       0.00      0.00      0.00         6
         report       0.00      0.00      0.00       114
self-disclosure       0.00      0.00      0.00       116

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (400, 1)
                 precision    recall  f1-score   support

     accusation       0.62      0.60      0.61       124
  cyberbullying       0.00      0.00      0.00         8
         denial       0.00      0.00      0.00         7
         report       0.59      0.67      0.63       134
self-disclosure       0.71      0.72      0.72       127

       accuracy                           0.64       400
      macro avg       0.39      0.40      0.39       400
   weighted avg       0.62      0.64      0.63       400

[[74  0  0 38 12]
 [ 5  0  0  1  2]
 [ 3  0  0  2  2]
 [23  0  0 90 21]
 [14  0  0 21 92]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.31      1.00      0.47       124
  cyberbullying       0.00      0.00      0.00         8
         denial       0.00      0.00      0.00         7
         report       0.00      0.00      0.00       134
self-disclosure       0.00      0.00      0.00       127

 

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (450, 1)
                 precision    recall  f1-score   support

     accusation       0.60      0.58      0.59       140
  cyberbullying       0.00      0.00      0.00         9
         denial       0.00      0.00      0.00         7
         report       0.58      0.68      0.62       154
self-disclosure       0.73      0.69      0.71       140

       accuracy                           0.63       450
      macro avg       0.38      0.39      0.39       450
   weighted avg       0.61      0.63      0.62       450

[[ 81   0   0  44  15]
 [  6   0   0   2   1]
 [  3   0   0   3   1]
 [ 31   0   0 105  18]
 [ 15   0   0  28  97]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.31      1.00      0.47       140
  cyberbullying       0.00      0.00      0.00         9
         denial       0.00      0.00      0.00         7
         report       0.00      0.00      0.00       154
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://s

shape of y_pred: (500, 1)
                 precision    recall  f1-score   support

     accusation       0.63      0.61      0.62       148
  cyberbullying       0.00      0.00      0.00         9
         denial       0.00      0.00      0.00         8
         report       0.62      0.71      0.66       183
self-disclosure       0.71      0.68      0.69       152

       accuracy                           0.65       500
      macro avg       0.39      0.40      0.40       500
   weighted avg       0.63      0.65      0.64       500

[[ 91   0   0  43  14]
 [  6   0   0   1   2]
 [  2   0   0   4   2]
 [ 29   0   0 130  24]
 [ 17   0   0  32 103]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.30      1.00      0.46       148
  cyberbullying       0.00      0.00      0.00         9
         denial       0.00      0.00      0.00         8
         report       0.00      0.00      0.00       183
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (550, 1)
                 precision    recall  f1-score   support

     accusation       0.56      0.53      0.54       158
  cyberbullying       0.00      0.00      0.00        10
         denial       0.00      0.00      0.00         9
         report       0.59      0.74      0.66       211
self-disclosure       0.74      0.63      0.68       162

       accuracy                           0.62       550
      macro avg       0.38      0.38      0.38       550
   weighted avg       0.61      0.62      0.61       550

[[ 83   0   0  63  12]
 [  6   0   0   1   3]
 [  3   0   0   5   1]
 [ 34   0   0 157  20]
 [ 22   0   0  38 102]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.29      1.00      0.45       158
  cyberbullying       0.00      0.00      0.00        10
         denial       0.00      0.00      0.00         9
         report       0.00      0.00      0.00       211
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://s

shape of y_pred: (600, 1)
                 precision    recall  f1-score   support

     accusation       0.57      0.55      0.56       174
  cyberbullying       0.00      0.00      0.00        10
         denial       0.00      0.00      0.00         9
         report       0.61      0.69      0.65       233
self-disclosure       0.70      0.69      0.70       174

       accuracy                           0.63       600
      macro avg       0.38      0.39      0.38       600
   weighted avg       0.61      0.63      0.62       600

[[ 95   0   0  63  16]
 [  5   0   0   1   4]
 [  3   0   0   5   1]
 [ 42   0   0 161  30]
 [ 22   0   0  32 120]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.29      1.00      0.45       174
  cyberbullying       0.00      0.00      0.00        10
         denial       0.00      0.00      0.00         9
         report       0.00      0.00      0.00       233
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://s

shape of y_pred: (650, 1)
                 precision    recall  f1-score   support

     accusation       0.55      0.58      0.57       192
  cyberbullying       0.00      0.00      0.00        10
         denial       0.00      0.00      0.00         9
         report       0.63      0.68      0.65       252
self-disclosure       0.69      0.65      0.67       187

       accuracy                           0.62       650
      macro avg       0.37      0.38      0.38       650
   weighted avg       0.61      0.62      0.61       650

[[112   0   0  63  17]
 [  3   0   0   3   4]
 [  5   0   0   1   3]
 [ 50   0   0 171  31]
 [ 32   0   0  33 122]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.30      1.00      0.46       192
  cyberbullying       0.00      0.00      0.00        10
         denial       0.00      0.00      0.00         9
         report       0.00      0.00      0.00       252
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://s

shape of y_pred: (700, 1)
                 precision    recall  f1-score   support

     accusation       0.54      0.56      0.55       201
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        10
         report       0.63      0.69      0.66       281
self-disclosure       0.73      0.68      0.70       197

       accuracy                           0.63       700
      macro avg       0.38      0.38      0.38       700
   weighted avg       0.61      0.63      0.62       700

[[112   0   0  70  19]
 [  6   0   0   4   1]
 [  3   0   0   3   4]
 [ 62   0   0 193  26]
 [ 26   0   0  38 133]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.29      1.00      0.45       201
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        10
         report       0.00      0.00      0.00       281
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (750, 1)
                 precision    recall  f1-score   support

     accusation       0.56      0.56      0.56       212
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        11
         report       0.65      0.71      0.68       308
self-disclosure       0.75      0.74      0.75       208

       accuracy                           0.65       750
      macro avg       0.39      0.40      0.40       750
   weighted avg       0.63      0.65      0.64       750

[[118   0   0  74  20]
 [  8   0   0   2   1]
 [  6   0   0   3   2]
 [ 63   0   0 218  27]
 [ 17   0   0  37 154]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44       212
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        11
         report       0.00      0.00      0.00       308
self-disclosure       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
scores

{200: [0.5, 0.4979346075754394],
 400: [0.56, 0.5727523632022875],
 600: [0.6, 0.6186672771039412],
 800: [0.655, 0.6684384943797278],
 1000: [0.66, 0.6720354264010413],
 1200: [0.6733333333333333, 0.6846866608173142],
 1400: [0.6542857142857142, 0.6679673441488694],
 1600: [0.64, 0.652310514494858],
 1800: [0.6288888888888889, 0.641190908340896],
 2000: [0.648, 0.6592713298799591],
 2200: [0.6218181818181818, 0.6315527750730913],
 2400: [0.6266666666666667, 0.6360779718693844],
 2600: [0.6230769230769231, 0.6327999550935809],
 2800: [0.6257142857142857, 0.6360345147342796],
 3000: [0.6533333333333333, 0.6630034137293007]}

## TDIDF Vectorizer ## 

In [7]:
def identity(words):
    return words
vectorizer = TfidfVectorizer(tokenizer=identity, encoding='utf-8', preprocessor=None, use_idf=True,
                             lowercase=False, ngram_range=(1,2))
#                              , stop_words='english')#,
#                              min_df=5, max_df=0.7)

In [8]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

(3066, 57608)

In [9]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['idf_weights'])
weights = df_idf.sort_values(by=['idf_weights'], ascending=False)
weights.to_csv('tdidfweights_'+target+'.csv', index=True)
# weights.to_csv('tdidfweights_noemoji.csv', index=True)
# weights
# the lower the idf value of a word, the less unique it is to any particular document
# terms with higher weight scores are considered to be more important

In [10]:
# tdidf score of first tweet
# if a word occurs multiple times in a document, we should boost its relevance as it should be 
# more meaningful than other words that appear fewer times (TF)
# On the other hand, if a word occurs many times in all documents, maybe it is just a frequent word
vector = pd.DataFrame(X[1].T.todense(), index=vectorizer.get_feature_names(), columns=['tdidf'])
vector = vector.sort_values(by=['tdidf'], ascending=False)

In [11]:
D = vectorizer.vocabulary_
max_word = max(D, key=D.get)
max_value = max(D.values())
print(max_word, max_value)

~ you 57607


In [18]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

# X_train, X_test, y_train, y_test =tts(X, y, random_state=0, stratify=y, shuffle=True)
# print("shape of X_train:", X_train.shape)
# print("shape of y_train:", y_train.shape)
# print("shape of X_test:", X_test.shape)
# print("shape of y_test:", y_test.shape)

lg = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)         
lg.fit(X, y)
y_pred = lg.predict(X)#.reshape(-1,1)
print("shape of y_pred:", y_pred.shape)

In [None]:
print(clsr(y, y_pred, target_names=names))
print(cm(y, y_pred, labels=[0,1,2,3,4]))
# acc = accuracy_score(y_test, y_pred)
# if not binary:
#     f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
# else:
#     f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))

# print('naive model (only no)')
# y_naive = np.array(['report']*len(y_test))
# y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
# print(clsr(y_test, y_naive, target_names=names))
# print(cm(y_test, y_naive, labels=[1,0]))

In [41]:
acc, f1 = prediction(X, y, n=0.2, binary=False)
print(acc, f1)

shape of X: (3066, 57608)
shape of y: (3066,)
shape of X_train: (2299, 57608)
shape of y_train: (2299,)
shape of X_test: (767, 57608)
shape of y_test: (767,)
shape of y_pred: (767, 1)
                 precision    recall  f1-score   support

     accusation       0.65      0.43      0.52       214
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        12
         report       0.63      0.88      0.73       317
self-disclosure       0.82      0.70      0.76       213

       accuracy                           0.68       767
      macro avg       0.42      0.40      0.40       767
   weighted avg       0.67      0.68      0.66       767

[[ 91   0   0 105  18]
 [  7   0   0   4   0]
 [  5   0   0   5   2]
 [ 25   0   0 278  14]
 [ 11   0   0  52 150]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44       214
  cyberbullying       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
acc, f1 = prediction(X, y, n=0.2, cv=True, binary=False)
print(acc, f1)

shape of X: (3066, 57608)
shape of y: (3066,)
shape of X_train: (2299, 57608)
shape of y_train: (2299,)
shape of X_test: (767, 57608)
shape of y_test: (767,)
shape of y_pred: (767, 1)
                 precision    recall  f1-score   support

     accusation       0.64      0.55      0.59       214
  cyberbullying       0.00      0.00      0.00        11
         denial       0.00      0.00      0.00        12
         report       0.66      0.82      0.74       317
self-disclosure       0.80      0.72      0.76       213

       accuracy                           0.69       767
      macro avg       0.42      0.42      0.42       767
   weighted avg       0.68      0.69      0.68       767

[[117   0   0  78  19]
 [  7   0   0   4   0]
 [  7   0   0   3   2]
 [ 39   0   0 261  17]
 [ 12   0   0  47 154]]
naive model (only no)
                 precision    recall  f1-score   support

     accusation       0.28      1.00      0.44       214
  cyberbullying       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
idx = range(300, X.shape[0], 200)
scores_tfidf = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True, binary=False)
    scores_tfidf[i] = [acc , f1]

300
shape of X: (300, 57608)
shape of y: (300,)
shape of X_train: (225, 57608)
shape of y_train: (225,)
shape of X_test: (75, 57608)
shape of y_test: (75,)




shape of y_pred: (75, 1)
              precision    recall  f1-score   support

           0       0.60      0.72      0.65        25
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         2
           3       0.50      0.32      0.39        19
           4       0.64      0.84      0.72        25

    accuracy                           0.60        75
   macro avg       0.35      0.38      0.35        75
weighted avg       0.54      0.60      0.56        75

[[18  0  0  3  4]
 [ 1  0  0  1  2]
 [ 1  0  0  0  1]
 [ 8  0  0  6  5]
 [ 2  0  0  2 21]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.33      1.00      0.50        25
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        25

    accuracy                           0.33    

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (125, 1)
              precision    recall  f1-score   support

           0       0.68      0.62      0.65        40
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         3
           3       0.72      0.62      0.67        37
           4       0.61      0.85      0.71        40

    accuracy                           0.66       125
   macro avg       0.40      0.42      0.40       125
weighted avg       0.62      0.66      0.63       125

[[25  0  0  4 11]
 [ 1  0  0  3  1]
 [ 1  0  0  1  1]
 [ 5  0  0 23  9]
 [ 5  0  0  1 34]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.32      1.00      0.48        40
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00        37
           4       0.00      0.00      0.00        40

    accuracy                           0.32   

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (175, 1)
              precision    recall  f1-score   support

           0       0.70      0.62      0.65        52
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         3
           3       0.68      0.52      0.59        50
           4       0.65      0.92      0.76        64

    accuracy                           0.67       175
   macro avg       0.41      0.41      0.40       175
weighted avg       0.64      0.67      0.64       175

[[32  0  0  9 11]
 [ 2  0  0  1  3]
 [ 1  0  0  0  2]
 [ 8  0  0 26 16]
 [ 3  0  0  2 59]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.30      1.00      0.46        52
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00        50
           4       0.00      0.00      0.00        64

    accuracy                           0.30   

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (225, 1)
              precision    recall  f1-score   support

           0       0.71      0.63      0.67        62
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         4
           3       0.78      0.67      0.72        67
           4       0.70      0.91      0.79        86

    accuracy                           0.72       225
   macro avg       0.44      0.44      0.43       225
weighted avg       0.69      0.72      0.70       225

[[39  0  0  8 15]
 [ 3  0  0  1  2]
 [ 1  0  0  0  3]
 [ 8  0  0 45 14]
 [ 4  0  0  4 78]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.28      1.00      0.43        62
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00        67
           4       0.00      0.00      0.00        86

    accuracy                           0.28   

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (275, 1)
              precision    recall  f1-score   support

           0       0.69      0.68      0.68        78
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.70      0.70      0.70        88
           4       0.76      0.86      0.81        98

    accuracy                           0.72       275
   macro avg       0.43      0.45      0.44       275
weighted avg       0.69      0.72      0.71       275

[[53  0  0 16  9]
 [ 2  0  0  4  1]
 [ 2  0  0  1  1]
 [11  0  0 62 15]
 [ 9  0  0  5 84]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.28      1.00      0.44        78
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00        88
           4       0.00      0.00      0.00        98

    accuracy                           0.28   

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (325, 1)
              precision    recall  f1-score   support

           0       0.61      0.65      0.63        98
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         5
           3       0.63      0.62      0.63       104
           4       0.78      0.82      0.80       111

    accuracy                           0.68       325
   macro avg       0.40      0.42      0.41       325
weighted avg       0.65      0.68      0.66       325

[[64  0  0 25  9]
 [ 4  0  0  2  1]
 [ 3  0  0  0  2]
 [25  0  0 65 14]
 [ 9  0  0 11 91]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.30      1.00      0.46        98
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00       104
           4       0.00      0.00      0.00       111

    accuracy                           0.30   

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (375, 1)
              precision    recall  f1-score   support

           0       0.61      0.57      0.59       115
           1       0.00      0.00      0.00         8
           2       0.00      0.00      0.00         6
           3       0.59      0.70      0.64       124
           4       0.74      0.73      0.74       122

    accuracy                           0.65       375
   macro avg       0.39      0.40      0.39       375
weighted avg       0.62      0.65      0.63       375

[[66  0  0 40  9]
 [ 5  0  0  3  0]
 [ 2  0  0  0  4]
 [19  0  0 87 18]
 [16  0  0 17 89]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.31      1.00      0.47       115
           1       0.00      0.00      0.00         8
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00       124
           4       0.00      0.00      0.00       122

    accuracy                           0.31   

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (425, 1)
              precision    recall  f1-score   support

           0       0.66      0.66      0.66       135
           1       0.00      0.00      0.00         8
           2       0.00      0.00      0.00         7
           3       0.67      0.70      0.69       142
           4       0.76      0.83      0.79       133

    accuracy                           0.70       425
   macro avg       0.42      0.44      0.43       425
weighted avg       0.68      0.70      0.69       425

[[ 89   0   0  31  15]
 [  4   0   0   3   1]
 [  3   0   0   1   3]
 [ 28   0   0  99  15]
 [ 10   0   0  13 110]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.32      1.00      0.48       135
           1       0.00      0.00      0.00         8
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00       142
           4       0.00      0.00      0.00       133

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (475, 1)
              precision    recall  f1-score   support

           0       0.63      0.55      0.59       143
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         8
           3       0.60      0.75      0.67       169
           4       0.76      0.72      0.74       146

    accuracy                           0.65       475
   macro avg       0.40      0.40      0.40       475
weighted avg       0.64      0.65      0.64       475

[[ 79   0   0  51  13]
 [  6   0   0   2   1]
 [  4   0   0   2   2]
 [ 24   0   0 127  18]
 [ 12   0   0  29 105]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.30      1.00      0.46       143
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         8
           3       0.00      0.00      0.00       169
           4       0.00      0.00      0.00       146

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (525, 1)
              precision    recall  f1-score   support

           0       0.64      0.56      0.60       152
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         8
           3       0.64      0.77      0.70       198
           4       0.80      0.78      0.79       158

    accuracy                           0.69       525
   macro avg       0.42      0.42      0.42       525
weighted avg       0.67      0.69      0.67       525

[[ 85   0   0  54  13]
 [  3   0   0   5   1]
 [  1   0   0   3   4]
 [ 32   0   0 153  13]
 [ 11   0   0  24 123]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.29      1.00      0.45       152
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         8
           3       0.00      0.00      0.00       198
           4       0.00      0.00      0.00       158

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (575, 1)
              precision    recall  f1-score   support

           0       0.63      0.55      0.59       166
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         9
           3       0.64      0.78      0.71       222
           4       0.77      0.73      0.75       168

    accuracy                           0.67       575
   macro avg       0.41      0.41      0.41       575
weighted avg       0.65      0.67      0.66       575

[[ 91   0   0  59  16]
 [  2   0   0   5   3]
 [  5   0   0   2   2]
 [ 32   0   0 174  16]
 [ 15   0   0  31 122]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.29      1.00      0.45       166
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00       222
           4       0.00      0.00      0.00       168

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (625, 1)
              precision    recall  f1-score   support

           0       0.64      0.62      0.63       185
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         9
           3       0.66      0.73      0.69       241
           4       0.73      0.73      0.73       180

    accuracy                           0.67       625
   macro avg       0.41      0.41      0.41       625
weighted avg       0.65      0.67      0.66       625

[[114   0   0  56  15]
 [  3   0   0   4   3]
 [  3   0   0   3   3]
 [ 37   0   0 176  28]
 [ 21   0   0  28 131]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.30      1.00      0.46       185
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00       241
           4       0.00      0.00      0.00       180

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (675, 1)
              precision    recall  f1-score   support

           0       0.62      0.56      0.59       196
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00        10
           3       0.63      0.76      0.69       266
           4       0.74      0.68      0.71       192

    accuracy                           0.66       675
   macro avg       0.40      0.40      0.40       675
weighted avg       0.64      0.66      0.64       675

[[109   0   0  68  19]
 [  2   0   0   7   2]
 [  4   0   0   4   2]
 [ 41   0   0 203  22]
 [ 19   0   0  42 131]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.29      1.00      0.45       196
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00       266
           4       0.00      0.00      0.00       192

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (725, 1)
              precision    recall  f1-score   support

           0       0.62      0.54      0.58       205
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00        11
           3       0.64      0.80      0.71       296
           4       0.81      0.71      0.76       202

    accuracy                           0.68       725
   macro avg       0.42      0.41      0.41       725
weighted avg       0.66      0.68      0.67       725

[[111   0   0  81  13]
 [  7   0   0   3   1]
 [  4   0   0   5   2]
 [ 42   0   0 237  17]
 [ 15   0   0  43 144]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.28      1.00      0.44       205
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00       296
           4       0.00      0.00      0.00       202

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
scores_tfidf

{300: [0.6, 0.6061148311474761],
 500: [0.656, 0.674991674991675],
 700: [0.6685714285714286, 0.6760675835185856],
 900: [0.72, 0.7317716701902748],
 1100: [0.7236363636363636, 0.7367264455974134],
 1300: [0.676923076923077, 0.6891769209840458],
 1500: [0.6453333333333333, 0.6576828870675016],
 1700: [0.7011764705882353, 0.7128044419597315],
 1900: [0.6547368421052632, 0.6656071515554839],
 2100: [0.6876190476190476, 0.697260132862925],
 2300: [0.6730434782608695, 0.6820294515320877],
 2500: [0.6736, 0.6834826288641748],
 2700: [0.6562962962962963, 0.6649992833199968],
 2900: [0.6786206896551724, 0.6870522964230481]}