# Text Model

Import the libraries and download the training data (`labels.csv`) from the virufy GitHub repository.

In [16]:
import pandas as pd
import pickle
import numpy as np


from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from imblearn.over_sampling import *


"""
import math
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
"""
"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

"""


'\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neural_network import MLPClassifier\n\n'

In [22]:
import os
import shutil
import urllib.request

LABELS_URL = "https://raw.githubusercontent.com/virufy/covid/master/data/labels.csv"
LABELS_FILE = "labels.csv"


def download_labels(url=LABELS_URL, destination=LABELS_FILE):
  """
  Download the labels CSV and store it at `destination`.

  Parameters:
    url: address of the CSV file to fetch
    destination: path the file is written to; an existing file is replaced
  Returns: the destination path
  Raises: urllib.error.URLError / OSError when the download or the write fails
  """
  # Write to a temporary name first so a failed or interrupted download never
  # leaves a truncated file under the real name.
  partial = destination + ".part"
  try:
    with urllib.request.urlopen(url) as response, open(partial, "wb") as out_file:
      shutil.copyfileobj(response, out_file)
    # Overwrite in one step: re-running the cell refreshes the same file
    # instead of piling up numbered copies next to a stale one.
    os.replace(partial, destination)
  finally:
    if os.path.exists(partial):
      os.remove(partial)
  return destination


# Only fetch the data when the notebook/script is executed, not on import.
if __name__ == "__main__":
  download_labels()

0

## **Main class**

In [18]:
class Text():
  """
  Preprocess the survey labels, train an SVM on them and evaluate it.

  The class keeps no state: every method receives the path of a labels CSV
  (and, for prediction, the path of a pickled model) as an argument.
  """

  # Columns whose missing values are replaced by the literal string "None".
  _OPTIONAL_COLUMNS = ["smoker", "patient_reported_symptoms", "age", "gender",
                       "medical_history"]

  # Entries searched for in the "medical_history" text. Each one becomes a
  # 0/1 feature column named exactly as written here.
  _MEDICAL_HISTORY_ITEMS = ['None,', 'congestive heart failure,',
                            'disease or conditions that make it harder to cough,',
                            'asthma or chronic lung disease,', 'pregnancy,',
                            'diabetes with complications,']

  # Entries searched for in the "patient_reported_symptoms" text.
  _SYMPTOM_ITEMS = ['fever, chills, or sweating,', 'shortness of breath,',
                    'new or worsening cough,', 'sore throat,', 'body aches,',
                    'loss of taste,', 'loss of smell,', 'none,']

  @staticmethod
  def _add_indicator_columns(df, column, items):
    """
    Turn a free-text column into one 0/1 column per known entry.

    Parameters:
      df: dataframe holding the text column (new columns are added to it)
      column: name of the text column to scan
      items: entries to look for; each becomes a column of the same name
    Returns: the dataframe without the original text column
    """
    text = df[column].str.lower()
    for item in items:
      # Compare lower-case against lower-case so an entry written with a
      # capital letter (e.g. 'None,') still matches. regex=False treats the
      # entry as plain text; na=False counts non-text cells as "absent".
      df[item] = text.str.contains(item.lower(), regex=False, na=False).astype(int)
    return df.drop(columns=[column])

  def pre_processing(self, in_file):
    """
    Preprocess the input file to the standard format.
    Parameter:
      in_file: string containing the input file name in .csv format
    Returns: target labels value (in 1 or 0) and preprocessed dataframe
    """
    labels_df = pd.read_csv(in_file)

    # dropping the columns unrelated to the text model
    labels_df = labels_df.drop(columns=["date", "cough_filename"])

    # Remove index-like "Unnamed" columns up front so that the dropna() below
    # cannot discard rows because of them.
    unnamed = labels_df.columns[labels_df.columns.str.contains('unnamed', case=False)]
    labels_df = labels_df.drop(columns=unnamed)

    # removing the rows which do not contain the information on covid tests
    # (assign the result back: fillna(inplace=True) on a column selection is
    # not guaranteed to reach the dataframe)
    labels_df["corona_test"] = labels_df["corona_test"].fillna("None")
    labels_df = labels_df[labels_df["corona_test"] != "None"].copy()

    # replacing the empty values with None
    for column in self._OPTIONAL_COLUMNS:
      labels_df[column] = labels_df[column].fillna("None")
    # any row still holding a missing value cannot be fed to the classifier
    labels_df = labels_df.dropna()

    # CORONA_TEST and other textual flags -> 0/1 (applied to every column)
    newdf = labels_df
    for text_value, numeric_value in (("negative", 0), ("positive", 1),
                                      ("FALSE", 0), ("TRUE", 1)):
      newdf = newdf.replace(to_replace=text_value, value=numeric_value)

    # AGE
    # NOTE(review): if age had missing values the column now mixes numbers
    # with the string "None"; confirm LabelEncoder accepts that for your data.
    newdf['age'] = LabelEncoder().fit_transform(newdf['age'])

    # Gender
    newdf['gender'] = newdf['gender'].str.lower()
    newdf['gender'] = LabelEncoder().fit_transform(newdf['gender'])

    # medical_history
    newdf = self._add_indicator_columns(newdf, "medical_history",
                                        self._MEDICAL_HISTORY_ITEMS)

    # smoker
    newdf['smoker'] = LabelEncoder().fit_transform(newdf['smoker'])

    # symptoms
    newdf = self._add_indicator_columns(newdf, "patient_reported_symptoms",
                                        self._SYMPTOM_ITEMS)

    # replace() may leave the column with object dtype; make the labels
    # plain integers so they are treated as class labels.
    target_labels = newdf["corona_test"].astype(int)
    newdf = newdf.drop(columns=["corona_test"])

    return target_labels, newdf

  def prep_data_train(self, train_data):
    """
    Prepare data for text model
    Parameter:
      train_data: string containing the labels file name in .csv format
    Returns:
      x_train, y_train: the oversampled training split
      x_test, y_test: the held-out split (15% of the rows, not oversampled)
    """
    target_labels, processed_df = self.pre_processing(train_data)
    # No random_state is passed, so the split differs from run to run.
    x_train_orig, x_test, y_train_orig, y_test = train_test_split(
        processed_df, target_labels, test_size=0.15, shuffle=True)

    # Oversample only the training split so the held-out rows stay real data.
    x_train, y_train = SMOTE(sampling_strategy='minority').fit_resample(
        x_train_orig, y_train_orig)
    return x_train, y_train, x_test, y_test

  def train_model(self, train_data):
    """
    Trains the text model using train_data and reports its accuracy
    Parameter:
      train_data: string containing the labels file name in .csv format
    Returns:
      a string containing the file name of the saved model (.sav)
    """
    X_train, y_train, x_test, y_test = self.prep_data_train(train_data)
    # degree and gamma are kept from the original setup; per the scikit-learn
    # documentation they are not used by the linear kernel.
    clf = SVC(kernel="linear", C=1, degree=2, gamma=0.001, random_state=0)
    clf.fit(X_train, y_train)

    print("\nTraining:")
    print("Accuracy: ", end="")
    pred = clf.predict(X_train)
    accuracy = accuracy_score(y_train, pred)
    print(accuracy)
    conf_mat = confusion_matrix(y_train, pred)
    print(conf_mat)
    print("\nTesting:")
    print("Accuracy: ", end="")
    pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, pred)
    print(accuracy)
    conf_mat = confusion_matrix(y_test, pred)
    print(conf_mat)
    print(classification_report(y_test, pred))
    print("\n\n")

    filename = 'textModelSVC.sav'
    # Context manager closes (and flushes) the file even if pickling fails.
    with open(filename, 'wb') as model_out:
      pickle.dump(clf, model_out)

    return filename

  def prep_data_test(self, test_data):
    """
    Prepare data for text model
    Parameter:
      test_data: string containing the labels file name in .csv format
    Returns:
      target_labels, processed_df is the test data prepared for model
    """
    target_labels, processed_df = self.pre_processing(test_data)
    return target_labels, processed_df

  def predict(self, test_data, model_file):
    """
    Predict covid positive or negative using the saved model and print the
    accuracy, confusion matrix and classification report.
    Parameters:
      test_data: string containing the labelled test file name in .csv format
      model_file: file name of a model saved by train_model
    Returns:
      the predictions returned by the loaded model for every row of test_data
    """
    # NOTE(review): pre_processing re-fits its LabelEncoders on this file, so
    # the integer codes for age/gender/smoker only match the training codes
    # when both files contain the same set of values -- confirm.
    y_test, x_test = self.prep_data_test(test_data)
    # SECURITY: unpickling executes code stored in the file; only load model
    # files that come from a trusted source.
    with open(model_file, 'rb') as model_in:
      loaded_model = pickle.load(model_in)
    pred = loaded_model.predict(x_test)
    print(pred)
    accuracy = accuracy_score(y_test, pred)
    print("Accuracy on test dataset : ", accuracy)
    conf_mat = confusion_matrix(y_test, pred)
    print("Confusion matrix :\n", conf_mat)
    print("Classification report \n", classification_report(y_test, pred))
    return pred

## Run the Model

In [19]:
if __name__ == "__main__":
  # Fit the classifier on the labels file. train_model prints its metrics
  # and hands back the file name of the pickled model.
  text = Text()
  train_data = "labels.csv"
  model_file = text.train_model(train_data)

  # To score a separate labelled CSV with the saved model:
  #   test_data = ""
  #   test = text.predict(test_data, model_file)


Training:
Accuracy: 0.9285714285714286
[[7 0]
 [1 6]]

Testing:
Accuracy: 0.6666666666666666
[[1 1]
 [0 1]]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3






