In [1]:
import pandas as pd

In [9]:
# Load the labelled training split from the local Data directory.
horror_train_data = pd.read_csv(
    './Data/horror-train.csv'
)

In [10]:
# Preview the first rows to check which columns were read from the CSV.
horror_train_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [11]:
# Load the test split from the local Data directory.
horror_test_data = pd.read_csv(
    './Data/horror-test.csv'
)

In [13]:
# Summarise the columns, dtypes and non-null counts of the training frame.
horror_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
id        19579 non-null object
text      19579 non-null object
author    19579 non-null object
dtypes: object(3)
memory usage: 459.0+ KB


In [14]:
# Keep only the model input (`text`) and the target (`author`); `id` is dropped.
horror_train_data = horror_train_data.loc[:, ['text', 'author']]

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [16]:
# One pipeline per candidate classifier. Every pipeline shares the same text
# featurisation: token counts (English stop words removed) re-weighted by TF-IDF.
pipelines = []
for model in (LogisticRegression(), MultinomialNB(), LinearSVC()):
    pipeline = make_pipeline(CountVectorizer(stop_words='english'), TfidfTransformer(), model)
    pipelines.append(pipeline)

In [19]:
from sklearn.model_selection import train_test_split
# Hold out part of the labelled data for validation. The split is shuffled, so
# a fixed seed keeps the scores reported below reproducible across kernel
# restarts.
trainx, testx, trainy, testy = train_test_split(
    horror_train_data.text,
    horror_train_data.author,
    random_state=42,
)

In [23]:
# Fit each candidate on the training split and print its score on the
# held-out split, in the same order as `pipelines`.
for pipeline in pipelines:
    print(pipeline.fit(trainx, trainy).score(testx, testy))

0.7946884576098059
0.8157303370786517
0.8034729315628192


In [24]:
# Summarise the columns, dtypes and non-null counts of the test frame.
horror_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392 entries, 0 to 8391
Data columns (total 2 columns):
id      8392 non-null object
text    8392 non-null object
dtypes: object(2)
memory usage: 131.2+ KB


In [25]:
# Predict an author for every test sentence with each fitted pipeline;
# `results[i]` holds the predictions of `pipelines[i]`.
results = []
for pipeline in pipelines:
    result = pipeline.predict(horror_test_data['text'])
    results.append(result)

In [26]:
# Show the per-pipeline prediction arrays collected above.
results

[array(['MWS', 'EAP', 'HPL', ..., 'EAP', 'MWS', 'EAP'], dtype=object),
 array(['MWS', 'EAP', 'EAP', ..., 'EAP', 'MWS', 'HPL'], dtype='<U3'),
 array(['MWS', 'EAP', 'EAP', ..., 'EAP', 'MWS', 'HPL'], dtype=object)]

In [30]:
# Apply only the fitted CountVectorizer (first step of the first pipeline)
# to the test text to inspect the resulting sparse count matrix.
pipelines[0].named_steps['countvectorizer'].transform(horror_test_data.text)

<8392x22181 sparse matrix of type '<class 'numpy.int64'>'
	with 88807 stored elements in Compressed Sparse Row format>

In [32]:
# Caching transformers (heading kept from the original notebook).
# NOTE(review): the pipeline is built with the default memory=None, so no
# transformer caching is actually enabled here - confirm the intent.
from sklearn.model_selection import GridSearchCV

svc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LinearSVC(),
)

In [33]:
# Display the pipeline's repr to check its configuration.
svc_pipe

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [34]:
# List the (name, estimator) pairs; the names are the prefixes used for
# `<step>__<param>` keys in a parameter grid.
svc_pipe.steps

[('countvectorizer',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words='english',
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('tfidftransformer',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('linearsvc',
  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='squared_hinge', max_iter=1000,
       multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
       verbose=0))]

In [35]:
import numpy as np

# Parameter grid for `svc_pipe`. Keys must be `<step name>__<parameter>`, and
# make_pipeline names each step after the lower-cased class name, so the SVM
# step is `linearsvc` (see `svc_pipe.steps`). The previous `LinearSVC__C` key
# does not match any step name.
# NOTE(review): the grid spans C = 1e1 .. 1e20, which is an unusually large
# range for a regularisation constant - confirm it is intended.
params = {
    'linearsvc__C': list(np.logspace(1, 20, 20)),
}

In [37]:
# Configure (but do not yet fit) a 2-fold grid search over `params`.
gs = GridSearchCV(estimator=svc_pipe, param_grid=params, cv=2)

In [2]:
#column transformer and pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Load the Titanic training data straight from GitHub, using PassengerId as
# the row index.
titanic_data = pd.read_csv(
    'https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/titanic-train.csv.txt',
    index_col='PassengerId',
)

In [4]:
# Preview the first rows to see the available columns.
titanic_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Feature columns, grouped by the preprocessing they receive:
# numeric columns are imputed and scaled, categorical ones imputed and encoded.
num_cols, cat_cols = ['Age', 'Fare'], ['Embarked', 'Sex', 'Pclass']

In [9]:
# Numeric branch: fill missing values with the column median, then standardise.
pipeline_num = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
])

In [10]:
# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode. Categories unseen during fit are ignored at transform time.
pipeline_cat = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [11]:
# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', pipeline_num, num_cols),
    ('cat', pipeline_cat, cat_cols),
])

In [13]:
# RandomForestClassifier is imported here because the notebook's own import
# sits in a later cell; without this the cell raises NameError on a fresh
# kernel (Restart & Run All).
from sklearn.ensemble import RandomForestClassifier

# Full model: column-wise preprocessing followed by a 10-tree random forest.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=10)),
])

In [14]:
from sklearn.ensemble import RandomForestClassifier
# Feature matrix: every column except the target.
X = titanic_data.drop(columns='Survived')

In [17]:
# Target vector: the Survived column.
Y = titanic_data['Survived']
from sklearn.model_selection import train_test_split

In [18]:
# Hold out part of the data for evaluation. The split is shuffled, so a fixed
# seed keeps the scores reported below reproducible across kernel restarts.
trainx, testx, trainy, testy = train_test_split(X, Y, random_state=42)

In [19]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(trainx,trainy)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [20]:
# Score the fitted pipeline on the held-out split.
pipeline.score(testx,testy)

0.8385650224215246

In [21]:
# Grid search over the pipeline: first list the step names, since grid keys
# are built by joining nested step names with double underscores.
pipeline.steps

[('preprocessor',
  ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
           transformer_weights=None,
           transformers=[('num', Pipeline(memory=None,
       steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
         strategy='median', verbose=0)), ('scaling', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['Age', 'Fare']), ('cat', Pipeline(memory=None,
       steps...4'>, handle_unknown='ignore',
         n_values=None, sparse=True))]), ['Embarked', 'Sex', 'Pclass'])])),
 ('classifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
              oob_score=False, random_state=None, verbose=0,
              warm_start

In [22]:
# Search space: the imputation strategy of the numeric branch
# (preprocessor -> num -> imputer) and the number of trees in the forest.
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    classifier__n_estimators=[10, 15, 20],
)

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# 5-fold grid search over `param_grid`, refit on the full training split,
# then scored on the held-out split.
# The `iid` argument was dropped: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises TypeError.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(trainx, trainy)
print(grid_search.score(testx, testy))

0.8340807174887892


In [25]:
# Show the parameter combination selected by the grid search.
grid_search.best_params_

{'classifier__n_estimators': 20,
 'preprocessor__num__imputer__strategy': 'median'}