In [1]:
import os, json, re
from pathlib import Path
import pandas as pd
import numpy as np
import glob
pd.set_option('display.max_columns', None)
import emoji

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm

from sklearn.metrics import classification_report as clsr
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def strip_ASCII(text):
    return re.sub("([^\x00-\x7F])+"," ", text)

In [3]:
path_to_json = Path('C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\data\\labelled_tweets')
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)
# file_list = file_list[:-2] 
tweets = pd.read_json(file_list[0],lines=True)
tweets.head()

Unnamed: 0,id,full_tweet,bullying_trace,bullying_role,form_of_bullying,bullying_post_type
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",yes,bully,general,denial
1,1162980168232853504,"[@user, @user, i, mean, it, have, some, really...",no,,,
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",yes,reporter,general,report
3,1162217363023880193,"[i, want, to, watch, a, group, of, character, ...",no,,,
4,1168710966659272705,"[i, d, never, let, a, fucking, fetus, on, twit...",no,,,


In [4]:
target = 'form_of_bullying'
tweets = tweets[['id', 'full_tweet', target]]
tweets.dropna(inplace=True)
print(tweets.shape)
print(tweets[target].value_counts())
tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
tweets.head()

(3089, 3)
general     1830
cyber       1059
verbal       106
physical      94
Name: form_of_bullying, dtype: int64


Unnamed: 0,id,full_tweet,form_of_bullying
0,1168174144724160518,"[@user, bullying, and, help, be, two, differen...",general
2,1161905202527526912,"[my, take, on, social, issue, like, this, be, ...",general
8,1163814523683270656,"[my, sister, be, bully, this, girl, yang, baru...",cyber
11,1160014315828723713,"[@user, you, re, give, someone, point, for, de...",cyber
14,1170474542415745024,"[if, i, say, something, about, you, i, will, s...",cyber


In [8]:
# def strip_emoji(text):
# #     print(emoji.emoji_count(text))
#     new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
#     return new_text


# tweets['full_tweet'] = [ [strip_emoji(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# tweets.head()

In [9]:
# def strip_repeat(text):  
# #     return re.sub(r'(.)\1+', r'\1\1', text) 
#     return re.sub(r'(\w)\1+', r'\1', text)

# strip_repeat('heheehehe')
# # tweets['full_tweet'] = [ [strip_ASCII(token) for token in tweet] for tweet in tweets['full_tweet'] ]
# # tweets.head()

In [10]:
# tweets['len'] = tweets['full_tweet'].apply(len)
# tweets.head()

In [11]:
# tweets.groupby('bullying_trace')['len'].describe()

## Count Vectorizer ## 

In [19]:
cv = CountVectorizer()
cv_fit = cv.fit_transform([' '.join(tweet) for tweet in tweets['full_tweet'] ])

In [20]:
words = np.asarray(cv.get_feature_names())
count = np.asarray( cv_fit.toarray().sum(axis=0) )
corpusdictionary = dict(zip(words,count))

count = pd.DataFrame.from_dict(corpusdictionary, orient='index', columns=['count'])
count = count.sort_values(by=['count'], ascending=False)
count.to_csv('count_'+target+'.csv', index=True)
count

Unnamed: 0,count
be,4800
user,3662
to,2985
and,2956
bully,2930
...,...
imitate,1
imi,1
imessage,1
imbecile,1


In [21]:
X = cv_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

general     59.242473
cyber       34.282939
verbal       3.431531
physical     3.043056
Name: form_of_bullying, dtype: float64

In [12]:
from sklearn import svm

In [17]:
def prediction(X, y, n=0.2, cv=False, k=10, binary=True):

    labels = LabelEncoder()
    y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
    names = labels.classes_
    print("shape of X:", X.shape)
    print("shape of y:", y.shape)
    
    X_train, X_test, y_train, y_test = tts(X, y, random_state=0, stratify=y, shuffle=True)
    print("shape of X_train:", X_train.shape)
    print("shape of y_train:", y_train.shape)
    print("shape of X_test:", X_test.shape)
    print("shape of y_test:", y_test.shape)
    
#     if cv:
#         clf = LogisticRegressionCV(cv=k, random_state=0, max_iter=1000)         
#     else:
#         clf = LogisticRegression()

    clf = svm.SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
              probability=False, random_state=None, shrinking=True, tol=0.001,
              verbose=False)
        
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)#.reshape(-1,1)
    print("shape of y_pred:", y_pred.shape)
    print(clsr(y_test, y_pred))#, target_names=names))
    print(cm(y_test, y_pred))#, labels=[0,1,2,3]))
    acc = accuracy_score(y_test, y_pred)
    if not binary:
        f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    else:
        f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))

    print('naive model (only no)')
    y_naive = np.array(['general']*len(y_test))
    y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
    print(clsr(y_test, y_naive))#, target_names=names))
    print(cm(y_test, y_naive, labels=[1,0]))

    return acc, f1

In [74]:
acc, f1 = prediction(X, y, n=0.2, binary=False)
print(acc, f1)

shape of X: (3089, 7239)
shape of y: (3089,)
shape of X_train: (2316, 7239)
shape of y_train: (2316,)
shape of X_test: (773, 7239)
shape of y_test: (773,)
shape of y_pred: (773, 1)
              precision    recall  f1-score   support

       cyber       0.70      0.75      0.72       265
     general       0.77      0.79      0.78       458
    physical       0.36      0.22      0.27        23
      verbal       0.33      0.07      0.12        27

    accuracy                           0.74       773
   macro avg       0.54      0.46      0.47       773
weighted avg       0.72      0.74      0.72       773

[[198  65   2   0]
 [ 83 364   7   4]
 [  0  18   5   0]
 [  1  24   0   2]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.34      1.00      0.51       265
     general       0.00      0.00      0.00       458
    physical       0.00      0.00      0.00        23
      verbal       0.00      0.00      0.00        27

    accuracy 

  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
acc, f1 = prediction(X, y, n=0.2, cv=True, binary=False)
print(acc, f1)

shape of X: (3089, 7239)
shape of y: (3089,)
shape of X_train: (2316, 7239)
shape of y_train: (2316,)
shape of X_test: (773, 7239)
shape of y_test: (773,)
shape of y_pred: (773, 1)
              precision    recall  f1-score   support

       cyber       0.70      0.75      0.72       265
     general       0.77      0.79      0.78       458
    physical       0.36      0.22      0.27        23
      verbal       0.33      0.07      0.12        27

    accuracy                           0.74       773
   macro avg       0.54      0.46      0.47       773
weighted avg       0.72      0.74      0.72       773

[[198  65   2   0]
 [ 83 364   7   4]
 [  0  18   5   0]
 [  1  24   0   2]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.34      1.00      0.51       265
     general       0.00      0.00      0.00       458
    physical       0.00      0.00      0.00        23
      verbal       0.00      0.00      0.00        27

    accuracy 

  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
idx = range(100, X.shape[0], 300)
scores = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True, binary=False)
    scores[i] = [acc , f1]

100
shape of X: (100, 7239)
shape of y: (100,)
shape of X_train: (75, 7239)
shape of y_train: (75,)
shape of X_test: (25, 7239)
shape of y_test: (25,)




shape of y_pred: (25, 1)
              precision    recall  f1-score   support

       cyber       0.54      0.78      0.64         9
     general       0.75      0.64      0.69        14
    physical       0.00      0.00      0.00         1
      verbal       0.00      0.00      0.00         1

    accuracy                           0.64        25
   macro avg       0.32      0.36      0.33        25
weighted avg       0.61      0.64      0.62        25

[[7 2 0 0]
 [5 9 0 0]
 [0 1 0 0]
 [1 0 0 0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.36      1.00      0.53         9
     general       0.00      0.00      0.00        14
    physical       0.00      0.00      0.00         1
      verbal       0.00      0.00      0.00         1

    accuracy                           0.36        25
   macro avg       0.09      0.25      0.13        25
weighted avg       0.13      0.36      0.19        25

[[ 0 14]
 [ 0  9]]
400
shape of X: (40

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (100, 1)
              precision    recall  f1-score   support

       cyber       0.60      0.50      0.55        30
     general       0.69      0.87      0.77        60
    physical       0.00      0.00      0.00         5
      verbal       0.00      0.00      0.00         5

    accuracy                           0.67       100
   macro avg       0.32      0.34      0.33       100
weighted avg       0.60      0.67      0.63       100

[[15 15  0  0]
 [ 8 52  0  0]
 [ 1  4  0  0]
 [ 1  4  0  0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.30      1.00      0.46        30
     general       0.00      0.00      0.00        60
    physical       0.00      0.00      0.00         5
      verbal       0.00      0.00      0.00         5

    accuracy                           0.30       100
   macro avg       0.07      0.25      0.12       100
weighted avg       0.09      0.30      0.14       100

[[ 0 60]
 [ 0 30]]
70

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (175, 1)
              precision    recall  f1-score   support

       cyber       0.63      0.60      0.62        53
     general       0.77      0.86      0.81       112
    physical       0.00      0.00      0.00         5
      verbal       0.00      0.00      0.00         5

    accuracy                           0.73       175
   macro avg       0.35      0.37      0.36       175
weighted avg       0.69      0.73      0.71       175

[[32 21  0  0]
 [16 96  0  0]
 [ 0  5  0  0]
 [ 3  2  0  0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.30      1.00      0.46        53
     general       0.00      0.00      0.00       112
    physical       0.00      0.00      0.00         5
      verbal       0.00      0.00      0.00         5

    accuracy                           0.30       175
   macro avg       0.08      0.25      0.12       175
weighted avg       0.09      0.30      0.14       175

[[  0 112]
 [  0  53]

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (250, 1)
              precision    recall  f1-score   support

       cyber       0.67      0.68      0.68        78
     general       0.78      0.83      0.81       157
    physical       0.00      0.00      0.00         7
      verbal       0.50      0.12      0.20         8

    accuracy                           0.74       250
   macro avg       0.49      0.41      0.42       250
weighted avg       0.72      0.74      0.72       250

[[ 53  25   0   0]
 [ 24 131   1   1]
 [  0   7   0   0]
 [  2   5   0   1]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.31      1.00      0.48        78
     general       0.00      0.00      0.00       157
    physical       0.00      0.00      0.00         7
      verbal       0.00      0.00      0.00         8

    accuracy                           0.31       250
   macro avg       0.08      0.25      0.12       250
weighted avg       0.10      0.31      0.15       250

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (325, 1)
              precision    recall  f1-score   support

       cyber       0.70      0.58      0.63       106
     general       0.72      0.88      0.79       194
    physical       1.00      0.09      0.17        11
      verbal       0.00      0.00      0.00        14

    accuracy                           0.71       325
   macro avg       0.60      0.39      0.40       325
weighted avg       0.69      0.71      0.68       325

[[ 61  45   0   0]
 [ 24 170   0   0]
 [  0  10   1   0]
 [  2  12   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.49       106
     general       0.00      0.00      0.00       194
    physical       0.00      0.00      0.00        11
      verbal       0.00      0.00      0.00        14

    accuracy                           0.33       325
   macro avg       0.08      0.25      0.12       325
weighted avg       0.11      0.33      0.16       325

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (400, 1)
              precision    recall  f1-score   support

       cyber       0.74      0.63      0.68       132
     general       0.72      0.86      0.78       233
    physical       0.00      0.00      0.00        14
      verbal       0.20      0.05      0.08        21

    accuracy                           0.71       400
   macro avg       0.41      0.38      0.38       400
weighted avg       0.67      0.71      0.68       400

[[ 83  48   0   1]
 [ 27 201   2   3]
 [  1  13   0   0]
 [  1  19   0   1]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.50       132
     general       0.00      0.00      0.00       233
    physical       0.00      0.00      0.00        14
      verbal       0.00      0.00      0.00        21

    accuracy                           0.33       400
   macro avg       0.08      0.25      0.12       400
weighted avg       0.11      0.33      0.16       400

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (475, 1)
              precision    recall  f1-score   support

       cyber       0.80      0.68      0.73       156
     general       0.75      0.89      0.81       280
    physical       0.67      0.12      0.21        16
      verbal       0.25      0.04      0.07        23

    accuracy                           0.76       475
   macro avg       0.61      0.44      0.46       475
weighted avg       0.74      0.76      0.73       475

[[106  49   0   1]
 [ 27 250   1   2]
 [  0  14   2   0]
 [  0  22   0   1]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.49       156
     general       0.00      0.00      0.00       280
    physical       0.00      0.00      0.00        16
      verbal       0.00      0.00      0.00        23

    accuracy                           0.33       475
   macro avg       0.08      0.25      0.12       475
weighted avg       0.11      0.33      0.16       475

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (550, 1)
              precision    recall  f1-score   support

       cyber       0.78      0.67      0.73       184
     general       0.76      0.89      0.82       326
    physical       0.67      0.12      0.20        17
      verbal       0.00      0.00      0.00        23

    accuracy                           0.76       550
   macro avg       0.55      0.42      0.44       550
weighted avg       0.73      0.76      0.73       550

[[124  59   0   1]
 [ 31 291   1   3]
 [  1  14   2   0]
 [  2  21   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.50       184
     general       0.00      0.00      0.00       326
    physical       0.00      0.00      0.00        17
      verbal       0.00      0.00      0.00        23

    accuracy                           0.33       550
   macro avg       0.08      0.25      0.13       550
weighted avg       0.11      0.33      0.17       550

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (625, 1)
              precision    recall  f1-score   support

       cyber       0.77      0.66      0.71       208
     general       0.75      0.88      0.81       372
    physical       0.17      0.05      0.08        20
      verbal       0.00      0.00      0.00        25

    accuracy                           0.75       625
   macro avg       0.42      0.40      0.40       625
weighted avg       0.71      0.75      0.72       625

[[138  70   0   0]
 [ 37 329   5   1]
 [  2  17   1   0]
 [  2  23   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.50       208
     general       0.00      0.00      0.00       372
    physical       0.00      0.00      0.00        20
      verbal       0.00      0.00      0.00        25

    accuracy                           0.33       625
   macro avg       0.08      0.25      0.12       625
weighted avg       0.11      0.33      0.17       625

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (700, 1)
              precision    recall  f1-score   support

       cyber       0.74      0.68      0.71       237
     general       0.75      0.86      0.80       415
    physical       0.43      0.14      0.21        22
      verbal       0.00      0.00      0.00        26

    accuracy                           0.74       700
   macro avg       0.48      0.42      0.43       700
weighted avg       0.71      0.74      0.72       700

[[160  76   0   1]
 [ 54 355   4   2]
 [  0  19   3   0]
 [  2  24   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.34      1.00      0.51       237
     general       0.00      0.00      0.00       415
    physical       0.00      0.00      0.00        22
      verbal       0.00      0.00      0.00        26

    accuracy                           0.34       700
   macro avg       0.08      0.25      0.13       700
weighted avg       0.11      0.34      0.17       700

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
scores

{100: [0.64, 0.6167832167832168],
 400: [0.67, 0.6258585858585859],
 700: [0.7314285714285714, 0.7070515924753213],
 1000: [0.74, 0.7233142969132779],
 1300: [0.7138461538461538, 0.682701408725957],
 1600: [0.7125, 0.6841205882208626],
 1900: [0.7557894736842106, 0.7308429207121454],
 2200: [0.7581818181818182, 0.7339622204725468],
 2500: [0.7488, 0.7227192354381871],
 2800: [0.74, 0.7191553752906781]}

## TDIDF Vectorizer ## 

In [5]:
def identity(words):
    return words
vectorizer = TfidfVectorizer(tokenizer=identity, encoding='utf-8', preprocessor=None, use_idf=True,
                             lowercase=False, ngram_range=(1,2)
                             , stop_words='english',
                             min_df=5, max_df=0.7)

In [6]:
tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

  'stop_words.' % sorted(inconsistent))


(3089, 2084)

In [30]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['idf_weights'])
weights = df_idf.sort_values(by=['idf_weights'], ascending=False)
weights.to_csv('tdidfweights_'+target+'.csv', index=True)
# weights.to_csv('tdidfweights_noemoji.csv', index=True)
weights
# the lower the idf value of a word, the less unique it is to any particular document
# terms with higher weight scores are considered to be more important

Unnamed: 0,idf_weights
lie until,8.342779
online persona,8.342779
one support,8.342779
one syllable,8.342779
one target,8.342779
...,...
and,1.545397
to,1.518949
@user,1.497964
be,1.278875


In [31]:
# tdidf score of first tweet
# if a word occurs multiple times in a document, we should boost its relevance as it should be 
# more meaningful than other words that appear fewer times (TF)
# On the other hand, if a word occurs many times in all documents, maybe it is just a frequent word
vector = pd.DataFrame(X[1].T.todense(), index=vectorizer.get_feature_names(), columns=['tdidf'])
vector.sort_values(by=['tdidf'], ascending=False)

Unnamed: 0,tdidf
suck,0.301374
everything suck,0.284080
everything,0.187606
because,0.157586
because everything,0.142040
...,...
from female,0.000000
from get,0.000000
from happen,0.000000
from head,0.000000


In [32]:
D = vectorizer.vocabulary_
max_word = max(D, key=D.get)
max_value = max(D.values())
print(max_word, max_value)

~ you 57873


In [18]:
acc, f1 = prediction(X, y, n=0.2, binary=False)
print(acc, f1)

shape of X: (3089, 2084)
shape of y: (3089,)
shape of X_train: (2316, 2084)
shape of y_train: (2316,)
shape of X_test: (773, 2084)
shape of y_test: (773,)
shape of y_pred: (773,)
              precision    recall  f1-score   support

           0       0.82      0.70      0.76       265
           1       0.76      0.91      0.83       458
           2       0.00      0.00      0.00        23
           3       0.00      0.00      0.00        27

    accuracy                           0.78       773
   macro avg       0.40      0.40      0.40       773
weighted avg       0.73      0.78      0.75       773

[[185  80   0   0]
 [ 39 419   0   0]
 [  0  23   0   0]
 [  1  26   0   0]]
naive model (only no)
              precision    recall  f1-score   support

           0       0.34      1.00      0.51       265
           1       0.00      0.00      0.00       458
           2       0.00      0.00      0.00        23
           3       0.00      0.00      0.00        27

    accuracy   

  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
def identity(words):
    return words
vectorizer = TfidfVectorizer(tokenizer=identity, encoding='utf-8', preprocessor=None, use_idf=True,
                             lowercase=False, ngram_range=(1,2)
                             , stop_words='english',
                             min_df=5, max_df=0.7)

tfidf_fit = vectorizer.fit_transform([tweet for tweet in tweets['full_tweet']])
X = tfidf_fit
X.shape

y = tweets[target]
freq = y.value_counts()           # count frequency of different classes in loan status
freq/sum(freq)*100   

labels = LabelEncoder()
y = labels.fit_transform( np.asarray(y) )#.reshape(-1,1)
names = labels.classes_
print("shape of X:", X.shape)
print("shape of y:", y.shape)

X_train, X_test, y_train, y_test = tts(X, y, random_state=0, stratify=y, shuffle=True)
print("shape of X_train:", X_train.shape)
print("shape of y_train:", y_train.shape)
print("shape of X_test:", X_test.shape)
print("shape of y_test:", y_test.shape)

from sklearn import svm
from sklearn.model_selection import GridSearchCV

def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    
    return grid_search.best_params_

param_dict = svc_param_selection(X, y,nfolds=10)

clf = svm.SVC(kernel='rbf', C=param_dict['C'], gamma=param_dict['gamma'])
# SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
#     probability=False, random_state=None, shrinking=True, tol=0.001,
#     verbose=False)

  'stop_words.' % sorted(inconsistent))


shape of X: (3089, 2084)
shape of y: (3089,)
shape of X_train: (2316, 2084)
shape of y_train: (2316,)
shape of X_test: (773, 2084)
shape of y_test: (773,)


In [8]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1,1)
print("shape of y_pred:", y_pred.shape)
print(clsr(y_test, y_pred, target_names=names))
print(cm(y_test, y_pred, labels=[0,1,2,3]))
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
# f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))
print(acc, f1)

print('naive model (only no)')
y_naive = np.array(['general']*len(y_test))
y_naive = labels.fit_transform(y_naive)#.reshape(-1,1)
print(clsr(y_test, y_naive, target_names=names))
print(cm(y_test, y_naive, labels=[1,0]))

shape of y_pred: (773, 1)
              precision    recall  f1-score   support

       cyber       0.82      0.70      0.76       265
     general       0.76      0.91      0.83       458
    physical       0.00      0.00      0.00        23
      verbal       0.00      0.00      0.00        27

    accuracy                           0.78       773
   macro avg       0.40      0.40      0.40       773
weighted avg       0.73      0.78      0.75       773

[[185  80   0   0]
 [ 39 419   0   0]
 [  0  23   0   0]
 [  1  26   0   0]]
0.7813712807244502 0.8044494486211698
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.34      1.00      0.51       265
     general       0.00      0.00      0.00       458
    physical       0.00      0.00      0.00        23
      verbal       0.00      0.00      0.00        27

    accuracy                           0.34       773
   macro avg       0.09      0.25      0.13       773
weighted avg       0.1

  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
acc, f1 = prediction(X, y, n=0.2, cv=True, binary=False)
print(acc, f1)

shape of X: (3089, 57874)
shape of y: (3089,)
shape of X_train: (2316, 57874)
shape of y_train: (2316,)
shape of X_test: (773, 57874)
shape of y_test: (773,)
shape of y_pred: (773, 1)
              precision    recall  f1-score   support

       cyber       0.84      0.57      0.68       265
     general       0.72      0.93      0.82       458
    physical       0.00      0.00      0.00        23
      verbal       0.00      0.00      0.00        27

    accuracy                           0.75       773
   macro avg       0.39      0.38      0.37       773
weighted avg       0.72      0.75      0.72       773

[[152 113   0   0]
 [ 30 428   0   0]
 [  0  23   0   0]
 [  0  27   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.34      1.00      0.51       265
     general       0.00      0.00      0.00       458
    physical       0.00      0.00      0.00        23
      verbal       0.00      0.00      0.00        27

    accura

  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
idx = range(100, X.shape[0], 300)
scores_tfidf = { }
for i in idx: 
    print(i)
    acc, f1 = prediction(X[:i,], y[:i], n=0.2, cv=True, binary=False)
    scores_tfidf[i] = [acc , f1]

100
shape of X: (100, 57874)
shape of y: (100,)
shape of X_train: (75, 57874)
shape of y_train: (75,)
shape of X_test: (25, 57874)
shape of y_test: (25,)
shape of y_pred: (25, 1)
              precision    recall  f1-score   support

       cyber       0.00      0.00      0.00         9
     general       0.56      1.00      0.72        14
    physical       0.00      0.00      0.00         1
      verbal       0.00      0.00      0.00         1

    accuracy                           0.56        25
   macro avg       0.14      0.25      0.18        25
weighted avg       0.31      0.56      0.40        25

[[ 0  9  0  0]
 [ 0 14  0  0]
 [ 0  1  0  0]
 [ 0  1  0  0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.36      1.00      0.53         9
     general       0.00      0.00      0.00        14
    physical       0.00      0.00      0.00         1
      verbal       0.00      0.00      0.00         1

    accuracy                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (175, 1)
              precision    recall  f1-score   support

       cyber       0.91      0.19      0.31        53
     general       0.68      0.99      0.80       112
    physical       0.00      0.00      0.00         5
      verbal       0.00      0.00      0.00         5

    accuracy                           0.69       175
   macro avg       0.40      0.29      0.28       175
weighted avg       0.71      0.69      0.61       175

[[ 10  43   0   0]
 [  1 111   0   0]
 [  0   5   0   0]
 [  0   5   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.30      1.00      0.46        53
     general       0.00      0.00      0.00       112
    physical       0.00      0.00      0.00         5
      verbal       0.00      0.00      0.00         5

    accuracy                           0.30       175
   macro avg       0.08      0.25      0.12       175
weighted avg       0.09      0.30      0.14       175

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (250, 1)
              precision    recall  f1-score   support

       cyber       0.96      0.33      0.50        78
     general       0.70      0.99      0.82       157
    physical       0.00      0.00      0.00         7
      verbal       0.00      0.00      0.00         8

    accuracy                           0.73       250
   macro avg       0.42      0.33      0.33       250
weighted avg       0.74      0.73      0.67       250

[[ 26  52   0   0]
 [  1 156   0   0]
 [  0   7   0   0]
 [  0   8   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.31      1.00      0.48        78
     general       0.00      0.00      0.00       157
    physical       0.00      0.00      0.00         7
      verbal       0.00      0.00      0.00         8

    accuracy                           0.31       250
   macro avg       0.08      0.25      0.12       250
weighted avg       0.10      0.31      0.15       250

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (325, 1)
              precision    recall  f1-score   support

       cyber       0.91      0.40      0.55       106
     general       0.68      0.98      0.80       194
    physical       0.00      0.00      0.00        11
      verbal       0.00      0.00      0.00        14

    accuracy                           0.71       325
   macro avg       0.40      0.34      0.34       325
weighted avg       0.70      0.71      0.66       325

[[ 42  64   0   0]
 [  4 190   0   0]
 [  0  11   0   0]
 [  0  14   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.49       106
     general       0.00      0.00      0.00       194
    physical       0.00      0.00      0.00        11
      verbal       0.00      0.00      0.00        14

    accuracy                           0.33       325
   macro avg       0.08      0.25      0.12       325
weighted avg       0.11      0.33      0.16       325

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (400, 1)
              precision    recall  f1-score   support

       cyber       0.86      0.42      0.57       132
     general       0.67      0.96      0.79       233
    physical       0.00      0.00      0.00        14
      verbal       0.00      0.00      0.00        21

    accuracy                           0.70       400
   macro avg       0.38      0.35      0.34       400
weighted avg       0.67      0.70      0.65       400

[[ 56  76   0   0]
 [  9 224   0   0]
 [  0  14   0   0]
 [  0  21   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.50       132
     general       0.00      0.00      0.00       233
    physical       0.00      0.00      0.00        14
      verbal       0.00      0.00      0.00        21

    accuracy                           0.33       400
   macro avg       0.08      0.25      0.12       400
weighted avg       0.11      0.33      0.16       400

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (475, 1)
              precision    recall  f1-score   support

       cyber       0.88      0.48      0.62       156
     general       0.69      0.96      0.81       280
    physical       0.00      0.00      0.00        16
      verbal       0.00      0.00      0.00        23

    accuracy                           0.73       475
   macro avg       0.39      0.36      0.36       475
weighted avg       0.70      0.73      0.68       475

[[ 75  81   0   0]
 [ 10 270   0   0]
 [  0  16   0   0]
 [  0  23   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.49       156
     general       0.00      0.00      0.00       280
    physical       0.00      0.00      0.00        16
      verbal       0.00      0.00      0.00        23

    accuracy                           0.33       475
   macro avg       0.08      0.25      0.12       475
weighted avg       0.11      0.33      0.16       475

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (550, 1)
              precision    recall  f1-score   support

       cyber       0.88      0.48      0.62       184
     general       0.70      0.96      0.81       326
    physical       0.00      0.00      0.00        17
      verbal       0.00      0.00      0.00        23

    accuracy                           0.73       550
   macro avg       0.39      0.36      0.36       550
weighted avg       0.71      0.73      0.69       550

[[ 88  96   0   0]
 [ 12 314   0   0]
 [  0  17   0   0]
 [  0  23   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.50       184
     general       0.00      0.00      0.00       326
    physical       0.00      0.00      0.00        17
      verbal       0.00      0.00      0.00        23

    accuracy                           0.33       550
   macro avg       0.08      0.25      0.13       550
weighted avg       0.11      0.33      0.17       550

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (625, 1)
              precision    recall  f1-score   support

       cyber       0.91      0.51      0.66       208
     general       0.71      0.97      0.82       372
    physical       0.00      0.00      0.00        20
      verbal       0.00      0.00      0.00        25

    accuracy                           0.75       625
   macro avg       0.41      0.37      0.37       625
weighted avg       0.73      0.75      0.71       625

[[107 101   0   0]
 [ 10 362   0   0]
 [  0  20   0   0]
 [  0  25   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.33      1.00      0.50       208
     general       0.00      0.00      0.00       372
    physical       0.00      0.00      0.00        20
      verbal       0.00      0.00      0.00        25

    accuracy                           0.33       625
   macro avg       0.08      0.25      0.12       625
weighted avg       0.11      0.33      0.17       625

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


shape of y_pred: (700, 1)
              precision    recall  f1-score   support

       cyber       0.86      0.57      0.69       237
     general       0.72      0.94      0.82       415
    physical       0.00      0.00      0.00        22
      verbal       0.00      0.00      0.00        26

    accuracy                           0.75       700
   macro avg       0.39      0.38      0.38       700
weighted avg       0.72      0.75      0.72       700

[[136 101   0   0]
 [ 23 392   0   0]
 [  0  22   0   0]
 [  0  26   0   0]]
naive model (only no)
              precision    recall  f1-score   support

       cyber       0.34      1.00      0.51       237
     general       0.00      0.00      0.00       415
    physical       0.00      0.00      0.00        22
      verbal       0.00      0.00      0.00        26

    accuracy                           0.34       700
   macro avg       0.08      0.25      0.13       700
weighted avg       0.11      0.34      0.17       700

[[  0

  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
scores_tfidf

{100: [0.56, 0.717948717948718],
 400: [0.59, 0.49475890985324933],
 700: [0.6914285714285714, 0.6463603425559947],
 1000: [0.728, 0.7129099344104943],
 1300: [0.7138461538461538, 0.7147839471829678],
 1600: [0.7, 0.7090968031959227],
 1900: [0.7263157894736842, 0.7402914621002112],
 2200: [0.730909090909091, 0.7408880613375015],
 2500: [0.7504, 0.7638181818181817],
 2800: [0.7542857142857143, 0.7716604399100484]}

In [None]:
from sklearn import svm, grid_searchdef svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_