In [1]:
import os, json, re
from pathlib import Path
import pandas as pd
import numpy as np
import glob
pd.set_option('display.max_columns', None)
import emoji

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import classification_report as clsr
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
def strip_ASCII(text):
    return re.sub("([^\x00-\x7F])+"," ", text)

def prediction(X, y, n=0.2, cv=False, k=10, binary=True, label=[0,1,2,3,4,5,6]):

    labels = LabelEncoder()
    y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
    names = labels.classes_
    print("shape of X:", X.shape)
    print("shape of y:", y.shape)
    
    X_train, X_test, y_train, y_test =tts(X, y, test_size=n, random_state=0, stratify=y, shuffle=True)
    print("shape of X_train:", X_train.shape)
    print("shape of y_train:", y_train.shape)
    print("shape of X_test:", X_test.shape)
    print("shape of y_test:", y_test.shape)
    
    if cv:
        lg = LogisticRegressionCV(cv=k, random_state=0, max_iter=1000)         
    else:
        lg = LogisticRegression()
        
    lg.fit(X_train, y_train)
    y_pred = lg.predict(X_test).reshape(-1,1)
    print("shape of y_pred:", y_pred.shape)
    print(clsr(y_test, y_pred, target_names=names))
    print(cm(y_test, y_pred))#, labels=label))
    acc = accuracy_score(y_test, y_pred)
    if not binary:
        f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    else:
        f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))

    print('naive model (only no)')
    y_naive = np.array(['reporter']*len(y_test))
    y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
    print(clsr(y_test, y_naive, target_names=names))
    print(cm(y_test, y_naive, labels=label))

    return acc, f1

In [3]:
path_to_json = Path('C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\data\\labelled_tweets')
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)
# file_list = file_list[:-2] 
file_list

['C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\data\\labelled_tweets\\all_tweets.json']

In [4]:
tweets = pd.read_json(file_list[0],lines=True)
tweets.head()

Unnamed: 0,id,full_tweet,bullying_trace,bullying_role,form_of_bullying,bullying_post_type
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",yes,bully,general,denial
1,1162980168232853504,"[@user, @user, i, mean, it, have, some, really...",no,,,
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",yes,reporter,general,report
3,1162217363023880193,"[i, want, to, watch, a, group, of, character, ...",no,,,
4,1168710966659272705,"[i, d, never, let, a, fucking, fetus, on, twit...",no,,,


In [5]:
target = 'bullying_role'
tweets = tweets[['id', 'full_tweet', target]]
tweets.dropna(inplace=True)

tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
tweets.head()

Unnamed: 0,id,full_tweet,bullying_role
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",bully
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",reporter
8,1163814523683270656,"[my, sister, be, bully, this, girl, yang, baru...",bystander
11,1160014315828723713,"[@user, you, re, give, someone, point, for, de...",bully
14,1170474542415745024,"[if, i, say, something, about, you, i, will, s...",accuser


In [6]:
print(tweets.shape)
tweets[target].value_counts()

(3065, 3)


reporter      876
victim        832
defender      626
accuser       410
bully         161
bystander     124
reinforcer     35
assistant       1
Name: bullying_role, dtype: int64

In [7]:
tweets.loc[(tweets[target] == 'bystander'),target]='other'
tweets.loc[(tweets[target] == 'assistant'),target]='other'
tweets.loc[(tweets[target] == 'reinforcer'),target]='other'
tweets.head(10)

Unnamed: 0,id,full_tweet,bullying_role
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",bully
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",reporter
8,1163814523683270656,"[my, sister, be, bully, this, girl, yang, baru...",other
11,1160014315828723713,"[@user, you, re, give, someone, point, for, de...",bully
14,1170474542415745024,"[if, i, say, something, about, you, i, will, s...",accuser
15,1166800355109289997,"[we, move, a, round, a, lot, grow, up, i, mean...",victim
17,1161705550855454720,"[kuhree, be, bully, me, on, snapchat, again, x...",victim
19,1160628746099023873,"[@user, you, be, so, disgusting, wtf, people, ...",defender
20,1165045015380406279,"[ally, be, it, ok, to, bully, u, me, sure, ten...",victim
25,1169119046903717888,"[@user, you, be, soo, beautiful, why, would, a...",defender


In [8]:
# tweets = tweets[tweets[target]!='assistant']
# # tweets = tweets[tweets[target]!='reinforcer']
# # tweets = tweets[tweets[target]!='bystander']
# # tweets = tweets[tweets[target]!='bully']
# print(tweets.shape)

In [9]:
# def strip_emoji(text):
# #     print(emoji.emoji_count(text))
#     new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
#     return new_text


# tweets['full_tweet'] = [ [strip_emoji(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# tweets.head()

In [10]:
# def strip_repeat(text):  
# #     return re.sub(r'(.)\1+', r'\1\1', text) 
#     return re.sub(r'(\w)\1+', r'\1', text)

# strip_repeat('heheehehe')
# # tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# # tweets.head()

In [11]:
# tweets['len'] = tweets['full_tweet'].apply(len)
# tweets.head()

In [12]:
# tweets.groupby('bullying_trace')['len'].describe()

## Count Vectorizer ## 

In [13]:
cv = CountVectorizer()
cv_fit = cv.fit_transform([' '.join(tweet) for tweet in tweets['full_tweet'] ])

In [14]:
words = np.asarray(cv.get_feature_names())
count = np.asarray( cv_fit.toarray().sum(axis=0) )
corpusdictionary = dict(zip(words,count))

count = pd.DataFrame.from_dict(corpusdictionary, orient='index', columns=['count'])
count = count.sort_values(by=['count'], ascending=False)
count.to_csv('count_'+target+'.csv', index=True)
count

Unnamed: 0,count
be,4763
user,3634
to,2957
and,2936
bully,2903
...,...
imi,1
imessage,1
imbecile,1
imam,1


In [15]:
X = cv_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

reporter    28.580750
victim      27.145188
defender    20.424144
accuser     13.376835
bully        5.252855
other        5.220228
Name: bullying_role, dtype: float64

In [16]:
acc, f1 = prediction(X, y, n=0.2, binary=False, label=[0,1,2,3,4,5])
print(acc, f1)

In [21]:
acc, f1 = prediction(X, y, n=0.2, cv=True, binary=False, label=[0,1,2,3,4,5])
print(acc, f1)

shape of X: (3065, 7214)
shape of y: (3065,)
shape of X_train: (2452, 7214)
shape of y_train: (2452,)
shape of X_test: (613, 7214)
shape of y_test: (613,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

shape of y_pred: (613, 1)
              precision    recall  f1-score   support

     accuser       0.41      0.24      0.31        82
       bully       0.50      0.03      0.06        32
    defender       0.43      0.46      0.44       125
       other       0.00      0.00      0.00        32
    reporter       0.50      0.71      0.59       175
      victim       0.76      0.81      0.78       167

    accuracy                           0.55       613
   macro avg       0.43      0.38      0.36       613
weighted avg       0.52      0.55      0.51       613

[[ 20   0  22   0  32   8]
 [  3   1   8   0   9  11]
 [ 15   0  57   0  46   7]
 [  1   0   8   0  16   7]
 [  8   1  31   0 124  11]
 [  2   0   8   0  21 136]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00      0.24        82
       bully       0.00      0.00      0.00        32
    defender       0.00      0.00      0.00       125
       other       0.00      

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
from collections import Counter
Counter(y)

Counter({'bully': 161,
         'reporter': 876,
         'other': 160,
         'accuser': 410,
         'victim': 832,
         'defender': 626})

In [20]:
# X = cv_fit
# X.shape

# y = tweets[target]
# freq = y.value_counts()           # count frequency of different classes in loan status
# freq/sum(freq)*100   

labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

X_train, X_test, y_train, y_test = tts(X, y, random_state=0, stratify=y, shuffle=True)
print("shape of X_train:", X_train.shape)
print("shape of y_train:", y_train.shape)
print("shape of X_test:", X_test.shape)
print("shape of y_test:", y_test.shape)

clf = LogisticRegressionCV(cv=k, random_state=0, max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1,1)
print("shape of y_pred:", y_pred.shape)
print(clsr(y_test, y_pred, target_names=names))
print(cm(y_test, y_pred, labels=[0,1,2,3]))
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
# f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))
print(acc, f1)

shape of X: (3065, 7214)
shape of y: (3065,)
shape of X_train: (2298, 7214)
shape of y_train: (2298,)
shape of X_test: (767, 7214)
shape of y_test: (767,)


NameError: name 'k' is not defined

In [112]:
idx = range(100, X.shape[0], 300)
scores = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True, binary=False)
    scores[i] = [acc , f1]

100
shape of X: (100, 57557)
shape of y: (100,)
shape of X_train: (75, 57557)
shape of y_train: (75,)
shape of X_test: (25, 57557)
shape of y_test: (25,)


  n_classes = len(y_idx)


shape of y_pred: (25, 1)
              precision    recall  f1-score   support

     accuser       0.00      0.00      0.00         4
       bully       0.00      0.00      0.00         2
   bystander       0.00      0.00      0.00         2
    defender       0.00      0.00      0.00         4
  reinforcer       0.00      0.00      0.00         1
    reporter       0.19      1.00      0.32         4
      victim       1.00      0.50      0.67         8

    accuracy                           0.32        25
   macro avg       0.17      0.21      0.14        25
weighted avg       0.35      0.32      0.26        25

[[0 0 0 0 0 4 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 4 4]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.28         4
       bully       0.00      0.00      0.00         2
   bystander       0.00      0.00      0.00         2
    defender       

  n_classes = len(y_idx)


shape of y_pred: (100, 1)
              precision    recall  f1-score   support

     accuser       0.25      1.00      0.40        18
       bully       0.00      0.00      0.00         8
   bystander       0.00      0.00      0.00         4
    defender       0.00      0.00      0.00        15
  reinforcer       0.00      0.00      0.00         2
    reporter       0.78      0.29      0.42        24
      victim       0.89      0.59      0.71        29

    accuracy                           0.42       100
   macro avg       0.27      0.27      0.22       100
weighted avg       0.49      0.42      0.38       100

[[18  0  0  0  0  0  0]
 [ 6  0  0  0  0  0  2]
 [ 4  0  0  0  0  0  0]
 [14  0  0  0  0  1  0]
 [ 2  0  0  0  0  0  0]
 [17  0  0  0  0  7  0]
 [11  0  0  0  0  1 17]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.18      1.00      0.31        18
       bully       0.00      0.00      0.00         8
   bystander       0.00

  n_classes = len(y_idx)


shape of y_pred: (175, 1)
              precision    recall  f1-score   support

     accuser       0.25      0.59      0.35        22
       bully       0.00      0.00      0.00         9
   bystander       0.00      0.00      0.00         4
    defender       0.64      0.23      0.33        31
  reinforcer       0.00      0.00      0.00         2
    reporter       0.70      0.73      0.72        45
      victim       0.84      0.87      0.86        62

    accuracy                           0.61       175
   macro avg       0.35      0.35      0.32       175
weighted avg       0.62      0.61      0.59       175

[[13  0  0  2  0  5  2]
 [ 5  0  0  1  0  0  3]
 [ 1  0  0  0  0  2  1]
 [17  0  0  7  0  5  2]
 [ 1  0  0  0  0  1  0]
 [10  0  0  0  0 33  2]
 [ 6  0  0  1  0  1 54]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00      0.22        22
       bully       0.00      0.00      0.00         9
   bystander       0.00

  n_classes = len(y_idx)


shape of y_pred: (250, 1)
              precision    recall  f1-score   support

     accuser       0.29      0.28      0.29        32
       bully       0.00      0.00      0.00        11
   bystander       0.00      0.00      0.00         5
    defender       0.71      0.60      0.65        40
  reinforcer       0.00      0.00      0.00         2
    reporter       0.64      0.79      0.71        68
      victim       0.80      0.88      0.84        92

    accuracy                           0.67       250
   macro avg       0.35      0.37      0.35       250
weighted avg       0.62      0.67      0.64       250

[[ 9  0  0  5  0 13  5]
 [ 2  0  0  2  0  0  7]
 [ 2  0  0  0  0  2  1]
 [ 4  0  0 24  0 10  2]
 [ 1  0  0  1  0  0  0]
 [ 8  0  0  1  0 54  5]
 [ 5  0  0  1  0  5 81]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00      0.23        32
       bully       0.00      0.00      0.00        11
   bystander       0.00

  n_classes = len(y_idx)


shape of y_pred: (325, 1)
              precision    recall  f1-score   support

     accuser       0.30      0.51      0.38        51
       bully       0.00      0.00      0.00        16
   bystander       0.00      0.00      0.00         5
    defender       0.56      0.20      0.29        46
  reinforcer       0.00      0.00      0.00         3
    reporter       0.64      0.71      0.67        96
      victim       0.81      0.85      0.83       108

    accuracy                           0.60       325
   macro avg       0.33      0.32      0.31       325
weighted avg       0.59      0.60      0.58       325

[[26  1  0  4  0 16  4]
 [ 5  0  0  0  0  3  8]
 [ 2  0  0  0  0  3  0]
 [25  0  0  9  0 11  1]
 [ 2  0  0  0  0  0  1]
 [18  1  0  2  0 68  7]
 [ 8  2  0  1  0  5 92]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.27        51
       bully       0.00      0.00      0.00        16
   bystander       0.00



shape of y_pred: (400, 1)
              precision    recall  f1-score   support

     accuser       0.35      0.41      0.38        64
       bully       0.00      0.00      0.00        20
   bystander       0.00      0.00      0.00         7
    defender       0.56      0.27      0.36        56
  reinforcer       0.00      0.00      0.00         3
    reporter       0.61      0.85      0.71       124
      victim       0.86      0.86      0.86       126

    accuracy                           0.64       400
   macro avg       0.34      0.34      0.33       400
weighted avg       0.59      0.64      0.60       400

[[ 26   0   0   5   0  30   3]
 [  6   0   0   1   0   6   7]
 [  0   0   0   1   0   4   2]
 [ 23   0   0  15   0  16   2]
 [  1   0   0   1   0   1   0]
 [ 13   0   0   2   0 105   4]
 [  5   0   0   2   0  11 108]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.28        64
       bully       0.00     



shape of y_pred: (550, 1)
              precision    recall  f1-score   support

     accuser       0.20      0.49      0.29        77
       bully       0.00      0.00      0.00        28
   bystander       0.00      0.00      0.00        16
    defender       0.59      0.45      0.51       105
  reinforcer       0.00      0.00      0.00         6
    reporter       0.57      0.48      0.52       157
      victim       0.85      0.80      0.82       161

    accuracy                           0.52       550
   macro avg       0.32      0.32      0.31       550
weighted avg       0.55      0.52      0.53       550

[[ 38   0   0  17   0  20   2]
 [ 16   0   0   2   0   2   8]
 [  7   0   0   2   0   4   3]
 [ 35   0   0  47   0  21   2]
 [  2   0   0   1   0   3   0]
 [ 66   0   0   8   0  75   8]
 [ 24   0   0   2   0   7 128]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.14      1.00      0.25        77
       bully       0.00     



shape of y_pred: (625, 1)
              precision    recall  f1-score   support

     accuser       0.34      0.15      0.21        95
       bully       0.75      0.10      0.17        31
   bystander       0.00      0.00      0.00        17
    defender       0.51      0.41      0.46       114
  reinforcer       0.00      0.00      0.00         7
    reporter       0.51      0.87      0.64       182
      victim       0.83      0.82      0.82       179

    accuracy                           0.59       625
   macro avg       0.42      0.34      0.33       625
weighted avg       0.57      0.59      0.55       625

[[ 14   1   0  22   0  56   2]
 [  2   3   0   5   0  11  10]
 [  2   0   0   1   0  13   1]
 [ 15   0   0  47   0  46   6]
 [  0   0   0   3   0   3   1]
 [  4   0   0   9   0 159  10]
 [  4   0   0   5   0  24 146]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.15      1.00      0.26        95
       bully       0.00     



shape of y_pred: (700, 1)
              precision    recall  f1-score   support

     accuser       0.28      0.41      0.33       101
       bully       1.00      0.08      0.15        37
   bystander       0.00      0.00      0.00        23
    defender       0.44      0.41      0.42       135
  reinforcer       0.00      0.00      0.00         7
    reporter       0.58      0.70      0.64       203
      victim       0.83      0.78      0.81       194

    accuracy                           0.56       700
   macro avg       0.45      0.34      0.34       700
weighted avg       0.58      0.56      0.55       700

[[ 41   0   0  22   0  34   4]
 [ 11   3   0   6   0   8   9]
 [  3   0   0   2   0  14   4]
 [ 45   0   0  55   0  31   4]
 [  1   0   0   3   0   3   0]
 [ 26   0   0  25   0 142  10]
 [ 20   0   0  11   0  11 152]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.14      1.00      0.25       101
       bully       0.00     



In [117]:
scores

{100: [0.32, 0.5511111111111111],
 400: [0.42, 0.5341335894152796],
 700: [0.6114285714285714, 0.6461591614906832],
 1000: [0.672, 0.692359441767019],
 1300: [0.6, 0.5907404364220498],
 1600: [0.635, 0.6487396582734456],
 1900: [0.6231578947368421, 0.6148312861089076],
 2200: [0.5236363636363637, 0.5786295531833423],
 2500: [0.5904, 0.5678606061117623],
 2800: [0.5614285714285714, 0.5701203819102401]}

## TDIDF Vectorizer ## 

In [21]:
def identity(words):
    return words
vectorizer = TfidfVectorizer(tokenizer=identity, encoding='utf-8', preprocessor=None, use_idf=True,
                             lowercase=False, ngram_range=(1,2))
#                              , stop_words='english')#,
#                              min_df=5, max_df=0.7)

In [22]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

(3065, 57558)

In [23]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['idf_weights'])
weights = df_idf.sort_values(by=['idf_weights'], ascending=False)
weights.to_csv('tdidfweights_'+target+'.csv', index=True)
# weights.to_csv('tdidfweights_noemoji.csv', index=True)
weights
# the lower the idf value of a word, the less unique it is to any particular document
# terms with higher weight scores are considered to be more important

Unnamed: 0,idf_weights
lie cyberbullying,8.334982
one with,8.334982
one target,8.334982
one there,8.334982
one travelling,8.334982
...,...
and,1.543760
to,1.521537
@user,1.499797
be,1.277945


In [24]:
# tdidf score of first tweet
# if a word occurs multiple times in a document, we should boost its relevance as it should be 
# more meaningful than other words that appear fewer times (TF)
# On the other hand, if a word occurs many times in all documents, maybe it is just a frequent word
vector = pd.DataFrame(X[1].T.todense(), index=vectorizer.get_feature_names(), columns=['tdidf'])
vector.sort_values(by=['tdidf'], ascending=False)

Unnamed: 0,tdidf
suck,0.301049
everything suck,0.283884
everything,0.187387
because,0.157895
suck victim,0.141942
...,...
from be,0.000000
from behind,0.000000
from bernie,0.000000
from boy,0.000000


In [28]:
D = vectorizer.vocabulary_
max_word = max(D, key=D.get)
max_value = max(D.values())
print(max_word, max_value)

~ you 57557


In [33]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

reporter    28.580750
victim      27.145188
defender    20.424144
accuser     13.376835
bully        5.252855
other        5.220228
Name: bullying_role, dtype: float64

In [42]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

X_train, X_test, y_train, y_test = tts(X, y, random_state=0, stratify=y, shuffle=True)
print("shape of X_train:", X_train.shape)
print("shape of y_train:", y_train.shape)
print("shape of X_test:", X_test.shape)
print("shape of y_test:", y_test.shape)

from sklearn import svm
from sklearn.model_selection import GridSearchCV

def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    
    return grid_search.best_params_

param_dict = svc_param_selection(X, y,nfolds=10)

clf = svm.SVC(kernel='rbf', C=param_dict['C'], gamma=param_dict['gamma'])
clf.fit(X_train, y_train)

shape of X: (3065, 57558)
shape of y: (3065,)
shape of X_train: (2298, 57558)
shape of y_train: (2298,)
shape of X_test: (767, 57558)
shape of y_test: (767,)


SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
y_pred = clf.predict(X_test).reshape(-1,1)
print("shape of y_pred:", y_pred.shape)
print(clsr(y_test, y_pred, target_names=names))
print(cm(y_test, y_pred, labels=[0,1,2,3]))
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
# f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))
print(acc, f1)

print('naive model (only no)')
y_naive = np.array(['general']*len(y_test))
y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
print(clsr(y_test, y_naive, target_names=names))
print(cm(y_test, y_naive, labels=[1,0]))

shape of y_pred: (767, 1)
              precision    recall  f1-score   support

     accuser       0.36      0.17      0.24       103
       bully       0.83      0.12      0.22        40
    defender       0.43      0.46      0.44       157
       other       0.00      0.00      0.00        40
    reporter       0.51      0.74      0.61       219
      victim       0.77      0.84      0.80       208

    accuracy                           0.56       767
   macro avg       0.48      0.39      0.38       767
weighted avg       0.53      0.56      0.52       767

[[18  0 35  0]
 [ 1  5 10  0]
 [18  0 72  0]
 [ 2  0  7  0]]
0.5632333767926988 0.553663501893475
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00      0.24       103
       bully       0.00      0.00      0.00        40
    defender       0.00      0.00      0.00       157
       other       0.00      0.00      0.00        40
    reporter       0.00      0.00      0

  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
acc, f1 = prediction(X, y, n=0.2, binary=False)
print(acc, f1)

shape of X: (3065, 57558)
shape of y: (3065,)
shape of X_train: (2298, 57558)
shape of y_train: (2298,)
shape of X_test: (767, 57558)
shape of y_test: (767,)
shape of y_pred: (767, 1)
              precision    recall  f1-score   support

     accuser       0.67      0.06      0.11       103
       bully       0.00      0.00      0.00        40
    defender       0.52      0.49      0.50       157
       other       0.00      0.00      0.00        40
    reporter       0.49      0.81      0.61       219
      victim       0.73      0.86      0.79       208

    accuracy                           0.57       767
   macro avg       0.40      0.37      0.33       767
weighted avg       0.53      0.57      0.51       767

[[  6   0  34   0  58   5]
 [  0   0   7   0  11  22]
 [  1   0  77   0  66  13]
 [  0   0   2   0  30   8]
 [  1   0  22   0 178  18]
 [  1   0   7   0  22 178]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00

  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
acc, f1 = prediction(X, y, n=0.2, cv=True, binary=False)
print(acc, f1)

shape of X: (3065, 57558)
shape of y: (3065,)
shape of X_train: (2298, 57558)
shape of y_train: (2298,)
shape of X_test: (767, 57558)
shape of y_test: (767,)
shape of y_pred: (767, 1)
              precision    recall  f1-score   support

     accuser       0.32      0.16      0.21       103
       bully       0.86      0.15      0.26        40
    defender       0.42      0.48      0.45       157
       other       0.00      0.00      0.00        40
    reporter       0.54      0.73      0.62       219
      victim       0.76      0.84      0.80       208

    accuracy                           0.56       767
   macro avg       0.48      0.39      0.39       767
weighted avg       0.53      0.56      0.53       767

[[ 16   0  40   0  43   4]
 [  1   6  10   0   6  17]
 [ 21   0  75   0  50  11]
 [  3   0   7   0  21   9]
 [  5   1  38   0 160  15]
 [  4   0  10   0  19 175]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00

  _warn_prf(average, modifier, msg_start, len(result))


In [128]:
idx = range(100, X.shape[0], 300)
scores_tfidf = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True, binary=False)
    scores_tfidf[i] = [acc , f1]

100
shape of X: (100, 57557)
shape of y: (100,)
shape of X_train: (75, 57557)
shape of y_train: (75,)
shape of X_test: (25, 57557)
shape of y_test: (25,)


  n_classes = len(y_idx)


shape of y_pred: (25, 1)
              precision    recall  f1-score   support

     accuser       0.00      0.00      0.00         4
       bully       0.00      0.00      0.00         2
   bystander       0.00      0.00      0.00         2
    defender       0.00      0.00      0.00         4
  reinforcer       0.00      0.00      0.00         1
    reporter       0.19      1.00      0.32         4
      victim       1.00      0.50      0.67         8

    accuracy                           0.32        25
   macro avg       0.17      0.21      0.14        25
weighted avg       0.35      0.32      0.26        25

[[0 0 0 0 0 4 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 4 4]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.28         4
       bully       0.00      0.00      0.00         2
   bystander       0.00      0.00      0.00         2
    defender       

  n_classes = len(y_idx)


shape of y_pred: (100, 1)
              precision    recall  f1-score   support

     accuser       0.25      1.00      0.40        18
       bully       0.00      0.00      0.00         8
   bystander       0.00      0.00      0.00         4
    defender       0.00      0.00      0.00        15
  reinforcer       0.00      0.00      0.00         2
    reporter       0.78      0.29      0.42        24
      victim       0.89      0.59      0.71        29

    accuracy                           0.42       100
   macro avg       0.27      0.27      0.22       100
weighted avg       0.49      0.42      0.38       100

[[18  0  0  0  0  0  0]
 [ 6  0  0  0  0  0  2]
 [ 4  0  0  0  0  0  0]
 [14  0  0  0  0  1  0]
 [ 2  0  0  0  0  0  0]
 [17  0  0  0  0  7  0]
 [11  0  0  0  0  1 17]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.18      1.00      0.31        18
       bully       0.00      0.00      0.00         8
   bystander       0.00

  n_classes = len(y_idx)


shape of y_pred: (175, 1)
              precision    recall  f1-score   support

     accuser       0.25      0.59      0.35        22
       bully       0.00      0.00      0.00         9
   bystander       0.00      0.00      0.00         4
    defender       0.64      0.23      0.33        31
  reinforcer       0.00      0.00      0.00         2
    reporter       0.70      0.73      0.72        45
      victim       0.84      0.87      0.86        62

    accuracy                           0.61       175
   macro avg       0.35      0.35      0.32       175
weighted avg       0.62      0.61      0.59       175

[[13  0  0  2  0  5  2]
 [ 5  0  0  1  0  0  3]
 [ 1  0  0  0  0  2  1]
 [17  0  0  7  0  5  2]
 [ 1  0  0  0  0  1  0]
 [10  0  0  0  0 33  2]
 [ 6  0  0  1  0  1 54]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00      0.22        22
       bully       0.00      0.00      0.00         9
   bystander       0.00

  n_classes = len(y_idx)


shape of y_pred: (250, 1)
              precision    recall  f1-score   support

     accuser       0.29      0.28      0.29        32
       bully       0.00      0.00      0.00        11
   bystander       0.00      0.00      0.00         5
    defender       0.71      0.60      0.65        40
  reinforcer       0.00      0.00      0.00         2
    reporter       0.64      0.79      0.71        68
      victim       0.80      0.88      0.84        92

    accuracy                           0.67       250
   macro avg       0.35      0.37      0.35       250
weighted avg       0.62      0.67      0.64       250

[[ 9  0  0  5  0 13  5]
 [ 2  0  0  2  0  0  7]
 [ 2  0  0  0  0  2  1]
 [ 4  0  0 24  0 10  2]
 [ 1  0  0  1  0  0  0]
 [ 8  0  0  1  0 54  5]
 [ 5  0  0  1  0  5 81]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.13      1.00      0.23        32
       bully       0.00      0.00      0.00        11
   bystander       0.00

  n_classes = len(y_idx)


shape of y_pred: (325, 1)
              precision    recall  f1-score   support

     accuser       0.30      0.51      0.38        51
       bully       0.00      0.00      0.00        16
   bystander       0.00      0.00      0.00         5
    defender       0.56      0.20      0.29        46
  reinforcer       0.00      0.00      0.00         3
    reporter       0.64      0.71      0.67        96
      victim       0.81      0.85      0.83       108

    accuracy                           0.60       325
   macro avg       0.33      0.32      0.31       325
weighted avg       0.59      0.60      0.58       325

[[26  1  0  4  0 16  4]
 [ 5  0  0  0  0  3  8]
 [ 2  0  0  0  0  3  0]
 [25  0  0  9  0 11  1]
 [ 2  0  0  0  0  0  1]
 [18  1  0  2  0 68  7]
 [ 8  2  0  1  0  5 92]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.27        51
       bully       0.00      0.00      0.00        16
   bystander       0.00



shape of y_pred: (400, 1)
              precision    recall  f1-score   support

     accuser       0.35      0.41      0.38        64
       bully       0.00      0.00      0.00        20
   bystander       0.00      0.00      0.00         7
    defender       0.56      0.27      0.36        56
  reinforcer       0.00      0.00      0.00         3
    reporter       0.61      0.85      0.71       124
      victim       0.86      0.86      0.86       126

    accuracy                           0.64       400
   macro avg       0.34      0.34      0.33       400
weighted avg       0.59      0.64      0.60       400

[[ 26   0   0   5   0  30   3]
 [  6   0   0   1   0   6   7]
 [  0   0   0   1   0   4   2]
 [ 23   0   0  15   0  16   2]
 [  1   0   0   1   0   1   0]
 [ 13   0   0   2   0 105   4]
 [  5   0   0   2   0  11 108]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.28        64
       bully       0.00     



shape of y_pred: (475, 1)
              precision    recall  f1-score   support

     accuser       0.39      0.41      0.40        74
       bully       0.50      0.04      0.08        24
   bystander       0.00      0.00      0.00        10
    defender       0.47      0.44      0.45        77
  reinforcer       0.00      0.00      0.00         4
    reporter       0.64      0.75      0.69       139
      victim       0.79      0.86      0.82       147

    accuracy                           0.62       475
   macro avg       0.40      0.36      0.35       475
weighted avg       0.59      0.62      0.60       475

[[ 30   0   0  16   0  20   8]
 [  2   1   0   5   0   4  12]
 [  1   0   0   1   0   6   2]
 [ 22   0   0  34   0  17   4]
 [  1   0   0   0   0   3   0]
 [ 16   0   0  11   0 104   8]
 [  5   1   0   6   0   8 127]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.16      1.00      0.27        74
       bully       0.00     



shape of y_pred: (550, 1)
              precision    recall  f1-score   support

     accuser       0.20      0.49      0.29        77
       bully       0.00      0.00      0.00        28
   bystander       0.00      0.00      0.00        16
    defender       0.59      0.45      0.51       105
  reinforcer       0.00      0.00      0.00         6
    reporter       0.57      0.48      0.52       157
      victim       0.85      0.80      0.82       161

    accuracy                           0.52       550
   macro avg       0.32      0.32      0.31       550
weighted avg       0.55      0.52      0.53       550

[[ 38   0   0  17   0  20   2]
 [ 16   0   0   2   0   2   8]
 [  7   0   0   2   0   4   3]
 [ 35   0   0  47   0  21   2]
 [  2   0   0   1   0   3   0]
 [ 66   0   0   8   0  75   8]
 [ 24   0   0   2   0   7 128]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.14      1.00      0.25        77
       bully       0.00     



shape of y_pred: (625, 1)
              precision    recall  f1-score   support

     accuser       0.34      0.15      0.21        95
       bully       0.75      0.10      0.17        31
   bystander       0.00      0.00      0.00        17
    defender       0.51      0.41      0.46       114
  reinforcer       0.00      0.00      0.00         7
    reporter       0.51      0.87      0.64       182
      victim       0.83      0.82      0.82       179

    accuracy                           0.59       625
   macro avg       0.42      0.34      0.33       625
weighted avg       0.57      0.59      0.55       625

[[ 14   1   0  22   0  56   2]
 [  2   3   0   5   0  11  10]
 [  2   0   0   1   0  13   1]
 [ 15   0   0  47   0  46   6]
 [  0   0   0   3   0   3   1]
 [  4   0   0   9   0 159  10]
 [  4   0   0   5   0  24 146]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.15      1.00      0.26        95
       bully       0.00     



shape of y_pred: (700, 1)
              precision    recall  f1-score   support

     accuser       0.28      0.41      0.33       101
       bully       1.00      0.08      0.15        37
   bystander       0.00      0.00      0.00        23
    defender       0.44      0.41      0.42       135
  reinforcer       0.00      0.00      0.00         7
    reporter       0.58      0.70      0.64       203
      victim       0.83      0.78      0.81       194

    accuracy                           0.56       700
   macro avg       0.45      0.34      0.34       700
weighted avg       0.58      0.56      0.55       700

[[ 41   0   0  22   0  34   4]
 [ 11   3   0   6   0   8   9]
 [  3   0   0   2   0  14   4]
 [ 45   0   0  55   0  31   4]
 [  1   0   0   3   0   3   0]
 [ 26   0   0  25   0 142  10]
 [ 20   0   0  11   0  11 152]]
naive model (only no)
              precision    recall  f1-score   support

     accuser       0.14      1.00      0.25       101
       bully       0.00     



In [129]:
scores_tfidf

{100: [0.32, 0.5511111111111111],
 400: [0.42, 0.5341335894152796],
 700: [0.6114285714285714, 0.6461591614906832],
 1000: [0.672, 0.692359441767019],
 1300: [0.6, 0.5907404364220498],
 1600: [0.635, 0.6487396582734456],
 1900: [0.6231578947368421, 0.6148312861089076],
 2200: [0.5236363636363637, 0.5786295531833423],
 2500: [0.5904, 0.5678606061117623],
 2800: [0.5614285714285714, 0.5701203819102401]}

In [260]:
labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

X_train, X_test, y_train, y_test =tts(X, y, random_state=0, stratify=y, shuffle=True)
print("shape of X_train:", X_train.shape)
print("shape of y_train:", y_train.shape)
print("shape of X_test:", X_test.shape)
print("shape of y_test:", y_test.shape)

scores_copy = { }
idx = range(200, X_train.shape[0], 200)
for i in idx:
    lg = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)         
    lg.fit(X_train[:i,], y_train[:i])
    y_pred = lg.predict(X_test)#.reshape(-1,1)
    print("shape of y_pred:", y_pred.shape)
    print(clsr(y_test, y_pred))#, target_names=names))
    print(cm(y_test, y_pred))#, labels=names))
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    scores_copy[i] = [acc , f1]

scores_copy

shape of X: (3065, 7214)
shape of y: (3065,)
shape of X_train: (2298, 7214)
shape of y_train: (2298,)
shape of X_test: (767, 7214)
shape of y_test: (767,)




shape of y_pred: (767,)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       103
           1       0.00      0.00      0.00        40
           2       0.00      0.00      0.00       157
           3       0.00      0.00      0.00        40
           4       0.35      0.83      0.49       219
           5       0.67      0.79      0.73       208

    accuracy                           0.45       767
   macro avg       0.17      0.27      0.20       767
weighted avg       0.28      0.45      0.34       767





ValueError: At least one label specified must be in y_true

In [None]:
scores_copy