In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import PredefinedSplit
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('dataset/train_preprocessed.csv')
data.dropna(inplace = True)

In [7]:
def vectorize(data):
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=8000)
    fit_data=tfidf.fit_transform(data)
    return fit_data

In [8]:
train_set = vectorize(data['text'])

In [9]:
train_stratify_y = data['class']
train_x, test_x, train_y, test_y = train_test_split(train_set,train_stratify_y,test_size=0.2,random_state=42,stratify=train_stratify_y)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((15485, 8000), (15485,), (3872, 8000), (3872,))

In [10]:
model=SVC()
model.fit(train_x,train_y)
pred=model.predict(test_x)

print(f"Accuracy Score for SVC  ={accuracy_score(test_y,pred)}")

Accuracy Score for SVC  =0.8207644628099173


In [77]:
models=[]
models.append(('LogisticRregression',LogisticRegression(C=1.0,penalty='l2', max_iter=1000)))
models.append(('KNearestNeighbors',KNeighborsClassifier()))
models.append(('DecisionTree',DecisionTreeClassifier(criterion='entropy')))
#models.append(('GradientBoostClassifier',GradientBoostingClassifier(learning_rate=1e-2, loss='deviance',n_estimators=100)))
#models.append(('AdaBoostClassifier',AdaBoostClassifier(learning_rate=1e-2,algorithm='SAMME.R',n_estimators=100)))
models.append(('SupportVectorClassifier',SVC(C=1.0,kernel='sigmoid')))
model_result=[]
scoring='accuracy'
model_training_result,model_validation_result=[],[]
print("Statistical Model TFIDF- Baseline Evaluation")
for name,model in models:
    kfold=KFold(n_splits=10)
    results=cross_val_score(model,train_x,train_y,cv=kfold)
    print("=======================")
    print("Classifiers: ",name, "Has a training score of", round(results.mean(), 2) * 100, "% accuracy score")
    #model_result.append(results.mean())
    
    predictions=cross_val_predict(model,test_x,test_y)
    accuracy = accuracy_score(predictions,test_y)
    model_training_result.append(results.mean())
    model_validation_result.append(accuracy)

final_outcomes=pd.DataFrame(columns=['Model','Training Acc','Validation Acc'])
final_outcomes['Model']=models
final_outcomes['Training Acc']=model_training_result
final_outcomes['Validation Acc']=model_validation_result

Statistical Model TFIDF- Baseline Evaluation
Classifiers:  LogisticRregression Has a training score of 79.0 % accuracy score
Classifiers:  KNearestNeighbors Has a training score of 65.0 % accuracy score
Classifiers:  DecisionTree Has a training score of 69.0 % accuracy score
Classifiers:  SupportVectorClassifier Has a training score of 80.0 % accuracy score


In [78]:
final_outcomes

Unnamed: 0,Model,Training Acc,Validation Acc
0,"(LogisticRregression, LogisticRegression(max_i...",0.792251,0.705062
1,"(KNearestNeighbors, KNeighborsClassifier())",0.652051,0.488895
2,"(DecisionTree, DecisionTreeClassifier(criterio...",0.693833,0.623192
3,"(SupportVectorClassifier, SVC(kernel='sigmoid'))",0.799485,0.744835


In [91]:
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto', kernel='sigmoid', C=1))
clf.fit(train_x, train_y)
pred = clf.predict(test_x)

acc = accuracy_score(test_y, pred)


In [11]:
result = classification_report(test_y, pred)
print(result)

              precision    recall  f1-score   support

           0       0.74      0.61      0.67       480
           1       0.93      0.75      0.83        88
           2       0.82      0.93      0.87      1573
           3       0.80      0.42      0.55        67
           4       0.73      0.73      0.73       112
           5       0.85      0.88      0.87       942
           6       0.79      0.69      0.73       316
           7       0.88      0.75      0.81       134
           8       0.89      0.63      0.74       147
           9       1.00      0.85      0.92        13

    accuracy                           0.82      3872
   macro avg       0.84      0.72      0.77      3872
weighted avg       0.82      0.82      0.82      3872



In [64]:
train_stratify_y = data['class']
train_x, test_x, train_y, test_y = train_test_split(data['text'],train_stratify_y,test_size=0.2,random_state=42,stratify=train_stratify_y)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((15485,), (15485,), (3872,), (3872,))

In [23]:
svm_pipe = Pipeline([('tfidf', TfidfVectorizer()),
                     ('svm', SVC())])
params = [{ 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
         'tfidf__max_features': [5000, 8000, 11000, 15000],   
         'svm__kernel': ['linear', 'poly', 'rbf','sigmoid'],
         'svm__C': [0.5, 1, 2]}]

#print(svm_pipe.get_params().keys())
gs_svm = GridSearchCV(svm_pipe,
                      param_grid=params,
                      scoring='accuracy',
                      cv=4,
                      verbose=10)


gs_svm.fit(train_x, train_y)
print(gs_svm.score(test_x, test_y))

Fitting 4 folds for each of 144 candidates, totalling 576 fits
[CV 1/4; 1/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 1/4; 1/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.793 total time=  15.0s
[CV 2/4; 1/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 2/4; 1/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.770 total time=  14.7s
[CV 3/4; 1/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 3/4; 1/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.789 total time=  14.9s
[CV 4/4; 1/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 4/4; 1/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1

[CV 2/4; 9/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.781 total time=  23.0s
[CV 3/4; 9/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 3)
[CV 3/4; 9/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.795 total time=  23.2s
[CV 4/4; 9/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 3)
[CV 4/4; 9/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.797 total time=  23.1s
[CV 1/4; 10/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 1/4; 10/144] END svm__C=0.5, svm__kernel=linear, tfidf__max_features=15000, tfidf__ngram_range=(1, 1);, score=0.795 total time=  15.4s
[CV 2/4; 10/144] START svm__C=0.5, svm__kernel=linear, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 2/4; 10/144] 

[CV 4/4; 17/144] END svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 2);, score=0.604 total time= 1.1min
[CV 1/4; 18/144] START svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 1/4; 18/144] END svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.621 total time= 1.1min
[CV 2/4; 18/144] START svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 2/4; 18/144] END svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.614 total time= 1.1min
[CV 3/4; 18/144] START svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 3/4; 18/144] END svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.619 total time= 1.3min
[CV 4/4; 18/144] START svm__C=0.5, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 4/4; 18/144] END svm__C=0.5, svm

[CV 2/4; 26/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 2);, score=0.763 total time=  28.5s
[CV 3/4; 26/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 2)
[CV 3/4; 26/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 2);, score=0.774 total time=  28.6s
[CV 4/4; 26/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 2)
[CV 4/4; 26/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 2);, score=0.777 total time=  28.8s
[CV 1/4; 27/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 3)
[CV 1/4; 27/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 3);, score=0.783 total time=  29.7s
[CV 2/4; 27/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 3)
[CV 2/4; 27/144] END svm__C=0.5, svm__kernel

[CV 1/4; 35/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.778 total time=  43.1s
[CV 2/4; 35/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2)
[CV 2/4; 35/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.752 total time=  39.7s
[CV 3/4; 35/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2)
[CV 3/4; 35/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.765 total time=  39.6s
[CV 4/4; 35/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2)
[CV 4/4; 35/144] END svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.768 total time=  39.7s
[CV 1/4; 36/144] START svm__C=0.5, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 3)
[CV 1/4; 36/144] END svm__C=0.5, svm

[CV 3/4; 43/144] END svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 1);, score=0.780 total time=  19.7s
[CV 4/4; 43/144] START svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 1)
[CV 4/4; 43/144] END svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 1);, score=0.784 total time=  19.5s
[CV 1/4; 44/144] START svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 1/4; 44/144] END svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 2);, score=0.796 total time=  26.5s
[CV 2/4; 44/144] START svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 2/4; 44/144] END svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 2);, score=0.771 total time=  26.1s
[CV 3/4; 44/144] START svm__C=0.5, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 

[CV 1/4; 52/144] END svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.804 total time=  16.3s
[CV 2/4; 52/144] START svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1)
[CV 2/4; 52/144] END svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.781 total time=  16.7s
[CV 3/4; 52/144] START svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1)
[CV 3/4; 52/144] END svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.800 total time=  16.0s
[CV 4/4; 52/144] START svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1)
[CV 4/4; 52/144] END svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.803 total time=  16.2s
[CV 1/4; 53/144] START svm__C=1, svm__kernel=linear, tfidf__max_features=8000, tfidf__ngram_range=(1, 2)
[CV 1/4; 53/144] END svm__C=1, svm__

[CV 3/4; 60/144] END svm__C=1, svm__kernel=linear, tfidf__max_features=15000, tfidf__ngram_range=(1, 3);, score=0.810 total time=  24.2s
[CV 4/4; 60/144] START svm__C=1, svm__kernel=linear, tfidf__max_features=15000, tfidf__ngram_range=(1, 3)
[CV 4/4; 60/144] END svm__C=1, svm__kernel=linear, tfidf__max_features=15000, tfidf__ngram_range=(1, 3);, score=0.811 total time=  24.1s
[CV 1/4; 61/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 1/4; 61/144] END svm__C=1, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.748 total time=  52.6s
[CV 2/4; 61/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 2/4; 61/144] END svm__C=1, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.737 total time=  52.2s
[CV 3/4; 61/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 3/4; 61/144] END svm__C=1, svm__kernel=

[CV 2/4; 69/144] END svm__C=1, svm__kernel=poly, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.710 total time= 1.1min
[CV 3/4; 69/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=11000, tfidf__ngram_range=(1, 3)
[CV 3/4; 69/144] END svm__C=1, svm__kernel=poly, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.718 total time= 1.1min
[CV 4/4; 69/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=11000, tfidf__ngram_range=(1, 3)
[CV 4/4; 69/144] END svm__C=1, svm__kernel=poly, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.718 total time= 1.1min
[CV 1/4; 70/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 1/4; 70/144] END svm__C=1, svm__kernel=poly, tfidf__max_features=15000, tfidf__ngram_range=(1, 1);, score=0.747 total time=  46.3s
[CV 2/4; 70/144] START svm__C=1, svm__kernel=poly, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 2/4; 70/144] END svm__C=1, svm__kernel=p

[CV 1/4; 78/144] END svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.814 total time=  32.2s
[CV 2/4; 78/144] START svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 2/4; 78/144] END svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.791 total time=  32.0s
[CV 3/4; 78/144] START svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 3/4; 78/144] END svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.804 total time=  32.0s
[CV 4/4; 78/144] START svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 4/4; 78/144] END svm__C=1, svm__kernel=rbf, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.807 total time=  32.4s
[CV 1/4; 79/144] START svm__C=1, svm__kernel=rbf, tfidf__max_features=11000, tfidf__ngram_range=(1, 1)
[CV 1/4; 79/144] END svm__C=1, svm__kernel=rbf, tfidf__max_

[CV 4/4; 86/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 2);, score=0.795 total time=  18.0s
[CV 1/4; 87/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3)
[CV 1/4; 87/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3);, score=0.795 total time=  20.2s
[CV 2/4; 87/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3)
[CV 2/4; 87/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3);, score=0.776 total time=  18.7s
[CV 3/4; 87/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3)
[CV 3/4; 87/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3);, score=0.797 total time=  19.0s
[CV 4/4; 87/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=5000, tfidf__ngram_range=(1, 3)
[CV 4/4; 87/144] END svm__C=

[CV 2/4; 95/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.786 total time=  21.5s
[CV 3/4; 95/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 2)
[CV 3/4; 95/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.805 total time=  21.8s
[CV 4/4; 95/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 2)
[CV 4/4; 95/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 2);, score=0.808 total time=  21.6s
[CV 1/4; 96/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 3)
[CV 1/4; 96/144] END svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 3);, score=0.814 total time=  22.9s
[CV 2/4; 96/144] START svm__C=1, svm__kernel=sigmoid, tfidf__max_features=15000, tfidf__ngram_range=(1, 3)
[CV 2/4; 96/144] END

[CV 4/4; 103/144] END svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 1);, score=0.801 total time=  12.6s
[CV 1/4; 104/144] START svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 1/4; 104/144] END svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2);, score=0.820 total time=  18.3s
[CV 2/4; 104/144] START svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 2/4; 104/144] END svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2);, score=0.803 total time=  17.8s
[CV 3/4; 104/144] START svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 3/4; 104/144] END svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2);, score=0.815 total time=  18.2s
[CV 4/4; 104/144] START svm__C=2, svm__kernel=linear, tfidf__max_features=11000, tfidf__ngram_range=(1, 2)
[CV 4/4; 104/144] EN

[CV 2/4; 112/144] END svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.758 total time=  47.3s
[CV 3/4; 112/144] START svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 1)
[CV 3/4; 112/144] END svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.759 total time=  46.5s
[CV 4/4; 112/144] START svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 1)
[CV 4/4; 112/144] END svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 1);, score=0.759 total time=  47.1s
[CV 1/4; 113/144] START svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 2)
[CV 1/4; 113/144] END svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 2);, score=0.748 total time=  57.8s
[CV 2/4; 113/144] START svm__C=2, svm__kernel=poly, tfidf__max_features=8000, tfidf__ngram_range=(1, 2)
[CV 2/4; 113/144] END svm__C=2, svm__kernel=

[CV 1/4; 121/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.820 total time=  24.6s
[CV 2/4; 121/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 2/4; 121/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.806 total time=  23.6s
[CV 3/4; 121/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 3/4; 121/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.818 total time=  23.4s
[CV 4/4; 121/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1)
[CV 4/4; 121/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 1);, score=0.812 total time=  23.6s
[CV 1/4; 122/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=5000, tfidf__ngram_range=(1, 2)
[CV 1/4; 122/144] END svm__C=2, svm__kernel=rbf, tfi

[CV 4/4; 129/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=11000, tfidf__ngram_range=(1, 3);, score=0.816 total time=  42.5s
[CV 1/4; 130/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 1/4; 130/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1);, score=0.821 total time=  24.8s
[CV 2/4; 130/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 2/4; 130/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1);, score=0.806 total time=  24.3s
[CV 3/4; 130/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 3/4; 130/144] END svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1);, score=0.817 total time=  24.4s
[CV 4/4; 130/144] START svm__C=2, svm__kernel=rbf, tfidf__max_features=15000, tfidf__ngram_range=(1, 1)
[CV 4/4; 130/144] END svm__C=2, svm__kernel=

[CV 2/4; 138/144] END svm__C=2, svm__kernel=sigmoid, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.787 total time=  19.4s
[CV 3/4; 138/144] START svm__C=2, svm__kernel=sigmoid, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 3/4; 138/144] END svm__C=2, svm__kernel=sigmoid, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.800 total time=  22.5s
[CV 4/4; 138/144] START svm__C=2, svm__kernel=sigmoid, tfidf__max_features=8000, tfidf__ngram_range=(1, 3)
[CV 4/4; 138/144] END svm__C=2, svm__kernel=sigmoid, tfidf__max_features=8000, tfidf__ngram_range=(1, 3);, score=0.800 total time=  20.0s
[CV 1/4; 139/144] START svm__C=2, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 1)
[CV 1/4; 139/144] END svm__C=2, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 1);, score=0.794 total time=  16.1s
[CV 2/4; 139/144] START svm__C=2, svm__kernel=sigmoid, tfidf__max_features=11000, tfidf__ngram_range=(1, 1)
[CV 2/4; 139/144]

In [25]:
print(gs_svm.best_params_)

{'svm__C': 2, 'svm__kernel': 'rbf', 'tfidf__max_features': 11000, 'tfidf__ngram_range': (1, 3)}


In [65]:
tfidf = TfidfVectorizer(ngram_range=(1, 3), lowercase=True, max_features=11000)
train_x=tfidf.fit_transform(train_x)
test_x=tfidf.transform(test_x)

clf = SVC(kernel='rbf', C=2)
clf.fit(train_x, train_y)
pred = clf.predict(test_x)

In [66]:
result = classification_report(test_y, pred)
print(result)

              precision    recall  f1-score   support

           0       0.72      0.63      0.67       480
           1       0.90      0.78      0.84        88
           2       0.84      0.93      0.88      1573
           3       0.79      0.55      0.65        67
           4       0.72      0.72      0.72       112
           5       0.87      0.88      0.87       942
           6       0.80      0.73      0.76       316
           7       0.84      0.79      0.82       134
           8       0.88      0.65      0.75       147
           9       1.00      0.77      0.87        13

    accuracy                           0.83      3872
   macro avg       0.83      0.74      0.78      3872
weighted avg       0.83      0.83      0.83      3872



In [67]:
accuracy_score(test_y, pred)

0.8300619834710744