In [63]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import PredefinedSplit
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [4]:
data = pd.read_csv('dataset/train_preprocessed.csv')
data.dropna(inplace = True)

In [82]:
def vectorize(data):
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=11000)
    fit_data=tfidf.fit_transform(data)
    return fit_data

In [83]:
train_set = vectorize(data['text'])

In [84]:
train_stratify_y = data['class']
train_x, test_x, train_y, test_y = train_test_split(train_set,train_stratify_y,test_size=0.2,random_state=42,stratify=train_stratify_y)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((15485, 11000), (15485,), (3872, 11000), (3872,))

In [76]:
model=SVC()
model.fit(train_x,train_y)
pred=model.predict(test_x)

print(f"Accuracy Score for SVC  ={accuracy_score(test_y,pred)}")

Accuracy Score for SVC  =0.8207644628099173


In [77]:
models=[]
models.append(('LogisticRregression',LogisticRegression(C=1.0,penalty='l2', max_iter=1000)))
models.append(('KNearestNeighbors',KNeighborsClassifier()))
models.append(('DecisionTree',DecisionTreeClassifier(criterion='entropy')))
#models.append(('GradientBoostClassifier',GradientBoostingClassifier(learning_rate=1e-2, loss='deviance',n_estimators=100)))
#models.append(('AdaBoostClassifier',AdaBoostClassifier(learning_rate=1e-2,algorithm='SAMME.R',n_estimators=100)))
models.append(('SupportVectorClassifier',SVC(C=1.0,kernel='sigmoid')))
model_result=[]
scoring='accuracy'
model_training_result,model_validation_result=[],[]
print("Statistical Model TFIDF- Baseline Evaluation")
for name,model in models:
    kfold=KFold(n_splits=10)
    results=cross_val_score(model,train_x,train_y,cv=kfold)
    print("=======================")
    print("Classifiers: ",name, "Has a training score of", round(results.mean(), 2) * 100, "% accuracy score")
    #model_result.append(results.mean())
    
    predictions=cross_val_predict(model,test_x,test_y)
    accuracy = accuracy_score(predictions,test_y)
    model_training_result.append(results.mean())
    model_validation_result.append(accuracy)

final_outcomes=pd.DataFrame(columns=['Model','Training Acc','Validation Acc'])
final_outcomes['Model']=models
final_outcomes['Training Acc']=model_training_result
final_outcomes['Validation Acc']=model_validation_result

Statistical Model TFIDF- Baseline Evaluation
Classifiers:  LogisticRregression Has a training score of 79.0 % accuracy score
Classifiers:  KNearestNeighbors Has a training score of 65.0 % accuracy score
Classifiers:  DecisionTree Has a training score of 69.0 % accuracy score
Classifiers:  SupportVectorClassifier Has a training score of 80.0 % accuracy score


In [78]:
final_outcomes

Unnamed: 0,Model,Training Acc,Validation Acc
0,"(LogisticRregression, LogisticRegression(max_i...",0.792251,0.705062
1,"(KNearestNeighbors, KNeighborsClassifier())",0.652051,0.488895
2,"(DecisionTree, DecisionTreeClassifier(criterio...",0.693833,0.623192
3,"(SupportVectorClassifier, SVC(kernel='sigmoid'))",0.799485,0.744835


In [91]:
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto', kernel='sigmoid', C=1))
clf.fit(train_x, train_y)
pred = clf.predict(test_x)

acc = accuracy_score(test_y, pred)


In [92]:
result = classification_report(test_y, pred)
print(result)

              precision    recall  f1-score   support

           0       0.69      0.60      0.64       480
           1       0.87      0.75      0.80        88
           2       0.79      0.92      0.85      1573
           3       0.83      0.45      0.58        67
           4       0.73      0.68      0.70       112
           5       0.83      0.80      0.81       942
           6       0.76      0.63      0.69       316
           7       0.83      0.68      0.75       134
           8       0.86      0.63      0.72       147
           9       1.00      0.85      0.92        13

    accuracy                           0.79      3872
   macro avg       0.82      0.70      0.75      3872
weighted avg       0.79      0.79      0.78      3872



In [93]:
acc

0.7902892561983471