# Machine Learning Models

In [3]:
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from sklearn import model_selection, svm , naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,balanced_accuracy_score,average_precision_score,f1_score,precision_score,recall_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB , BernoulliNB,GaussianNB,MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 

In [None]:
# Load the labelled dataset; the cells below read its "text" and "target" columns.
dataset = pd.read_csv('dataset.csv', encoding='utf-8')

# Show a bounded preview only. The previous last line cast the text column to
# str without assigning the result, so it changed nothing and merely echoed
# the whole array as cell output.
dataset.head()

In [None]:
# Split the dataset: 20% of the rows are held out for evaluation; the fixed
# random_state keeps the split reproducible across runs.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    dataset["text"].values.astype('str'),
    dataset["target"],
    random_state=1,
    test_size=0.20,
)

# Feature extraction: the TF-IDF vocabulary is fitted on the training text only,
# then the same fitted vectorizer transforms the test text.
# Train_X / Test_X are rebound to the vectorized matrices because later cells
# use Test_X and vectorizer under these names.
vectorizer = TfidfVectorizer(max_features=20000, decode_error="ignore")
Train_X = vectorizer.fit_transform(Train_X)
Test_X = vectorizer.transform(Test_X)

# Classifier
clf = LogisticRegression(C=0.1)

# Fit the classifier on the training split and time only the fit call
start = time.time()
clf.fit(Train_X, Train_Y)
end = time.time()

# Print the elapsed time for training the model
print("The elapsed time ", end - start)

# Predict the labels on the test split
clf_predictions = clf.predict(Test_X)

# Report the evaluation metrics as percentages.
# Precision uses precision_score: average_precision_score summarises a
# precision-recall curve and expects continuous scores, so calling it on hard
# label predictions did not report the classifier's precision.
print("f1 Score -> ",f1_score(Test_Y,clf_predictions)*100)
print("recall Score -> ",recall_score(Test_Y,clf_predictions)*100)
print("balanced Accuracy Score -> ",balanced_accuracy_score(Test_Y,clf_predictions)*100)
print("precision Score -> ",precision_score(Test_Y,clf_predictions)*100)
print("Accuracy Score -> ",accuracy_score(Test_Y,clf_predictions)*100)


In [13]:
# Persist the trained classifier to disk.
filename = 'finalized_model.pkl'

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the previous bare open() call was never closed.
# NOTE(review): only the classifier is saved here. The fitted vectorizer is not
# persisted, so the pickle alone cannot score raw text in a new session.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [14]:

# Read the pickled classifier back from disk (the file written by the cell above).
with open(filename, mode='rb') as model_file:
    Pickled_LR_Model = pickle.load(model_file)

# Echo the restored estimator so its parameters appear as the cell output
Pickled_LR_Model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Check the model restored from disk against the held-out test split:
# predict the labels, then report the model's score as a percentage.

# Label predictions from the reloaded model
Ypredict = Pickled_LR_Model.predict(Test_X)

# Score of the reloaded model on the test split
score = Pickled_LR_Model.score(Test_X, Test_Y)
score_percent = 100 * score
print("Test score: {0:.2f} %".format(score_percent))

# Display the predicted labels as the cell output
Ypredict

Test score: 84.17 %


array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0],
      dtype=int64)

In [17]:
# Label the unlabelled submission set with the reloaded model and write a CSV.
sample_sub = pd.read_csv('finalUnlabelled.csv', encoding='utf-8')

# Cast the text to str exactly as the training text was cast before vectorizing,
# so missing or non-string cells are handled the same way as during training
# instead of being passed raw to the vectorizer. The vectorizer fitted on the
# training split is reused; it is never re-fitted here.
Final = vectorizer.transform(sample_sub['text'].values.astype('str'))

y_pre = Pickled_LR_Model.predict(Final)

# Pair each original text with its predicted label and save without the index column
sub = pd.DataFrame({'text': sample_sub['text'].values.tolist(), 'target': y_pre})
sub.to_csv('LOG-TFIDF.csv', index=False)