In [96]:
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split,GridSearchCV

In [92]:
#Preparing the models
def getModels(random_state=42):
  """Build one instance of each baseline classifier, keyed by display name.

  Args:
    random_state: Seed forwarded to the estimators constructed here that take
      one (LinearSVC, DecisionTreeClassifier, RandomForestClassifier) so that
      repeated runs produce the same fitted models. Pass None to leave them
      unseeded.

  Returns:
    dict mapping a display name to an unfitted estimator, in this order:
    'Logistic Regression', 'Support Vector Machines', 'Decision Trees',
    'Random Forest', 'Naive Bayes', 'K-Nearest Neighbor'.
  """
  # Imported inside the function so the block carries its own dependencies.
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.naive_bayes import GaussianNB
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.svm import LinearSVC
  from sklearn.tree import DecisionTreeClassifier

  return {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machines': LinearSVC(random_state=random_state),
    'Decision Trees': DecisionTreeClassifier(random_state=random_state),
    'Random Forest': RandomForestClassifier(random_state=random_state),
    # NOTE(review): GaussianNB needs dense input; a scipy sparse matrix
    # (e.g. CountVectorizer output) has to be densified before fitting it --
    # confirm against how this dict is consumed.
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbor': KNeighborsClassifier(),
  }

In [89]:
def formatmodels(X_train,y_train,X_test,y_test):
  """Fit every classifier from getModels() and print a table of test metrics.

  Args:
    X_train: Training features passed to each estimator's fit().
    y_train: Training labels passed to each estimator's fit().
    X_test: Features passed to each estimator's predict().
    y_test: True labels the predictions are scored against.

  Prints a DataFrame with one row per model and the columns 'Accuracy',
  'Precision' and 'Recall' (each metric called with its default settings).
  Returns None.
  """
  models = getModels()
  rows = []
  for model in models.values():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # The metric functions take (y_true, y_pred). With the arguments the other
    # way round, precision and recall are exchanged, so order matters here.
    rows.append({
      'Accuracy': accuracy_score(y_test, predictions),
      'Precision': precision_score(y_test, predictions),
      'Recall': recall_score(y_test, predictions),
    })
  # rows were appended in the dict's iteration order, so they line up with the
  # model names used as the index.
  df_model = pd.DataFrame(rows, index=list(models), columns=['Accuracy', 'Precision', 'Recall'])
  print(df_model)

In [22]:
class DataFrameSelector(BaseEstimator,TransformerMixin):
  """Transformer that indexes its input with a fixed set of column names."""

  def __init__(self,attribute_names):
    # Kept as given; transform() uses it to index the input.
    self.attribute_names = attribute_names

  def fit(self,X,y=None):
    """Learn nothing; return this instance so calls can be chained."""
    return self

  def transform(self,X):
    """Return the `.values` of `X` indexed by `self.attribute_names`."""
    selected = X[self.attribute_names]
    return selected.values

In [67]:
class ArrayVectorizer(BaseEstimator,TransformerMixin):
  """Feed element 0 of every input row to a wrapped text vectorizer.

  Args:
    count_vectorizer: Object exposing fit(), transform() and fit_transform()
      over a list of documents.
    fitted: Pass True when `count_vectorizer` has already been fitted and must
      not be fitted again. This constructor parameter is never modified; the
      state learned by this object is recorded in `fitted_` instead.
  """

  def __init__(self,count_vectorizer,fitted = False):
    self.count_vectorizer = count_vectorizer
    self.fitted = fitted

  @staticmethod
  def _documents(X):
    """Collect element 0 of each row of X into a flat list."""
    return [row[0] for row in X]

  def fit(self,X,y=None):
    """Fit the wrapped vectorizer on X unless it was supplied pre-fitted.

    `y` is accepted and ignored so the step can be fitted with labels present.
    Returns self.
    """
    if not self.fitted:
      self.count_vectorizer.fit(self._documents(X))
    self.fitted_ = True
    return self

  def transform(self,X):
    """Return the wrapped vectorizer's output for element 0 of each row of X."""
    documents = self._documents(X)
    if self.fitted or getattr(self, "fitted_", False):
      return self.count_vectorizer.transform(documents)
    # Backward compatibility: when transform() is called before fit(), fit on
    # the data at hand instead of raising.
    matrix = self.count_vectorizer.fit_transform(documents)
    self.fitted_ = True
    return matrix

In [24]:
class Debugger(BaseEstimator, TransformerMixin):
  """Pass-through step that prints a preview of whatever data reaches it."""

  def fit(self, data, y=None, **fit_params):
    """Learn nothing; return this instance."""
    return self

  def transform(self, data):
    """Print the shape and the head of `data`, then return `data` itself."""
    print("Shape of Pre-processed Data:", data.shape)
    preview = pd.DataFrame(data).head()
    print(preview)
    return data

In [101]:
train_file = "train.csv"
test_file = "test.csv"

df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# Keep the label column and the test ids now: df_train / df_test are rebound to
# the pipeline output further down.
target = df_train["target"]
# NOTE(review): `id` shadows the builtin id(); the name is kept because other
# cells may read it.
id = df_test["id"]
num_attribs = []
text_attribs = ["text"]
count_vectorizer = CountVectorizer()
num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_attribs)),
])
text_pipeline = Pipeline([
    ('selector',DataFrameSelector(text_attribs)),
    ('vectorizer',ArrayVectorizer(count_vectorizer)),
])
full_pipeline = FeatureUnion(transformer_list=[
    ('text_pipeline',text_pipeline),
    ('num_pipeline',num_pipeline)
])
# Fit on the training frame only, then apply the fitted pipeline to the test frame.
df_train = full_pipeline.fit_transform(df_train)
df_test = full_pipeline.transform(df_test)

# Hold-out split; not consumed by the remaining statements of this cell.
X_train,X_test,y_train,y_test = train_test_split(df_train,target,test_size=0.2,random_state = 42)
clf = RidgeClassifier()
clf.fit(df_train,target)
scores = model_selection.cross_val_score(clf, df_train, target, cv=3, scoring="f1")
# Show the cross-validation scores; previously they were computed and discarded.
print(scores)
predictions = clf.predict(df_test)
output = pd.concat([id,pd.DataFrame(predictions,columns = ["target"])],axis = 1)
output.to_csv("output.csv",index=False)


[0.59453669 0.56498283 0.64082434]
