In [1]:
import sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
scikit_learn_version = sklearn.__version__
scikit_learn_version

'0.21.3'

In [3]:
sentiment_data = pd.read_csv('datasets/sentimental_analysis_data.csv',
                            header=None,
                            names=['Label', 'Text'],
                            sep='\t')
sentiment_data.sample(10)

Unnamed: 0,Label,Text
4363,0,i heard da vinci code sucked soo much only 2.5...
2827,1,I love Harry Potter..
5361,0,I think I hate Harry Potter because it outshin...
135,1,i really loved the da vinci code.
1000,1,And I love the way our three-generation Missio...
6782,0,"Oh, and Brokeback Mountain is a TERRIBLE movie..."
5726,0,"I hate Harry Potter, that daniel wotshisface n..."
1605,1,the last stand and Mission Impossible 3 both w...
4042,0,the Da Vinci Code sucks.
3101,1,", and i love brokeback mountain..."


In [4]:
sentiment_data.shape

(6918, 2)

In [5]:
X = sentiment_data['Text']
Y = sentiment_data['Label']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [7]:
x_train.shape, x_test.shape

((5534,), (1384,))

In [8]:
y_train.shape, y_test.shape

((5534,), (1384,))

In [9]:
tfidf_vect = TfidfVectorizer(max_features=15)
x_trans = tfidf_vect.fit_transform(x_train)

In [10]:
print(x_trans[0:3])

  (0, 12)	0.4410068728287687
  (0, 0)	0.5382109924028984
  (0, 11)	0.5078591663908792
  (0, 5)	0.5078591663908792
  (1, 10)	0.5637528502571348
  (1, 2)	0.5637528502571348
  (1, 0)	0.6036269109755735
  (2, 10)	0.7071067811865476
  (2, 2)	0.7071067811865476


In [11]:
x_trans.shape

(5534, 15)

In [12]:
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(x_trans, y_train)

linear_svc_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [13]:
x_test_trans = tfidf_vect.fit_transform(x_test)

In [14]:
x_test_trans.shape

(1384, 15)

In [15]:
y_pred = linear_svc_model.predict(x_test_trans)
y_pred

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [16]:
pred_results = pd.DataFrame({'y_test': y_test,
                            'y_pred': y_pred})
pred_results.sample(10)

Unnamed: 0,y_test,y_pred
5774,0,0
6744,0,0
2713,1,1
6887,0,0
5229,0,0
2778,1,1
3736,1,1
6094,0,0
5018,0,1
752,1,1


In [17]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8908959537572254

In [18]:
text_clf_param = {}

text_clf_param['preprocessing'] = tfidf_vect
text_clf_param['model'] = linear_svc_model
text_clf_param['sklearn_version'] = scikit_learn_version
text_clf_param['accuracy'] = accuracy

In [19]:
text_clf_param

{'preprocessing': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=15,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=None, strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'model': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
           verbose=0),
 'sklearn_version': '0.21.3',
 'accuracy': 0.8908959537572254}

In [20]:
import joblib

In [21]:
filename = 'models/text_clf_checkpoint.joblib'

joblib.dump(text_clf_param, filename)

['models/text_clf_checkpoint.joblib']

In [22]:
clf_checkpoint = joblib.load(filename)

In [23]:
reloaded_vect = clf_checkpoint['preprocessing']
reloaded_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=15,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [24]:
clf_model = clf_checkpoint['model']
clf_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [26]:
x_test_trans_new = reloaded_vect.fit_transform(x_test)
y_pred = clf_model.predict(x_test_trans_new)
y_pred

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [27]:
accuracy_score(y_test, y_pred)

0.8908959537572254

In [28]:
clf_checkpoint['accuracy']

0.8908959537572254

# Serializing Pipelines

In [29]:
from sklearn.pipeline import Pipeline

In [31]:
clf_pipeline = Pipeline(steps=[('tfidf_vect', tfidf_vect), ('classifier', classifier)])
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [32]:
y_pred = pipeline_model.predict(x_test)

In [33]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8916184971098265

In [34]:
pipeline_clf_param = {}

pipeline_clf_param['pipeline_clf'] = pipeline_model
pipeline_clf_param['sklearn_version'] = scikit_learn_version
pipeline_clf_param['accuracy'] = accuracy

In [35]:
filename = 'models/pipe_clf_checkpoint.joblib'

In [36]:
joblib.dump(pipeline_clf_param, filename)

['models/pipe_clf_checkpoint.joblib']

In [37]:
pipeline_clf_checkpoint = joblib.load(filename)

In [38]:
reloaded_pipeline = pipeline_clf_checkpoint['pipeline_clf']
reloaded_pipeline

Pipeline(memory=None,
         steps=[('tfidf_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=15,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
     

In [39]:
y_pred = reloaded_pipeline.predict(x_test)

In [40]:
accuracy_score(y_test, y_pred)

0.8916184971098265

In [41]:
pipeline_clf_checkpoint['accuracy']

0.8916184971098265