# Pickle

**The purpose of this notebook is to export our model to a pickle file, so we can use the file to build our final product.**

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, recall_score, f1_score

In [2]:
df = pd.read_csv("../Data/twitter_target.csv")

In [3]:
df.shape

(18990, 6)

## Final Model - Bagging Classifier with decreased dataset

In [4]:
# Assign X and y
X = df[['tweet']]
y = df['target']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 19, stratify = y)

dropping_list = np.random.choice(list(df[df['target'] == 0].index), 17000, replace = False)

df_decrease_class_0 = df.drop(dropping_list)

df_decrease_class_0['target'].value_counts()

df_decrease_class_0['target'].value_counts(normalize = True)

# Assign X and y
X = df_decrease_class_0[['tweet']]
y = df_decrease_class_0['target']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 19, stratify = y)

In [5]:
pipe_bag = Pipeline([
    ('cvec', CountVectorizer(stop_words = 'english')),
    ('bag', BaggingClassifier())
])

params_pipe_bag = {
    'cvec__min_df': [2, 3],
    'cvec__ngram_range': [(1,1), (1,2)],
    'bag__n_estimators': [8, 10]
}

gs_pipe_bag = GridSearchCV(pipe_bag, params_pipe_bag, cv = 5)
gs_pipe_bag.fit(X_train['tweet'], y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [6]:
print(f'Best params: {gs_pipe_bag.best_params_}')
print("------------------------------------")
print(f'Training score: {gs_pipe_bag.score(X_train["tweet"], y_train)}')
print(f'Testing score: {gs_pipe_bag.score(X_test["tweet"], y_test)}')
print(f'ROC AUC score: {roc_auc_score(y_test, gs_pipe_bag.predict(X_test["tweet"]))}')
print(f'Recall score: {recall_score(y_test, gs_pipe_bag.predict(X_test["tweet"]))}')
print(f'F1 score: {f1_score(y_test, gs_pipe_bag.predict(X_test["tweet"]))}')

Best params: {'bag__n_estimators': 10, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2)}
------------------------------------
Training score: 0.9691689008042895
Testing score: 0.6746987951807228
ROC AUC score: 0.6702354767757795
Recall score: 0.5847457627118644
F1 score: 0.6301369863013699


In [7]:
# put the two functions above together, using 'write binary' permissions

pickle.dump(gs_pipe_bag, open('../Files/final_model.p', 'wb+'))

**Check our work**

Let's read in our model and check the score/coefficients.
- `pickle.load(file)`: de-serializes the stored object back into a variable

In [8]:
# use the above function with open() and 'read binary' permissions to get our model back
model_from_pickle = pickle.load(open('../Files/final_model.p', 'rb'))

In [9]:
print(model_from_pickle.score(X_test['tweet'], y_test))

0.6746987951807228


In [10]:
model_from_pickle.predict_proba(["power outage is happening, so is blackouts weather is so bad"])

array([[0.64965368, 0.35034632]])

**Testing our model:**

In [11]:
some_tweet = ['This tweet could be a aggregation of all tweets on twitter mentioniing about blackouts']
print(model_from_pickle.predict(some_tweet))
print(model_from_pickle.predict_proba(some_tweet))

[1]
[[0.4 0.6]]
