In [None]:
import zipfile as zf

# Unpack the bundled archive into the current directory. The context manager
# closes the archive handle even when extraction raises, which the manual
# open()/close() pair did not guarantee.
with zf.ZipFile("datasets.zip", 'r') as files:
    files.extractall('.')

In [None]:
from pathlib import Path

zip_path = Path("datasets.zip")

# Open the archive through a Path object and peek at its contents. The `with`
# block closes the file handle again; a bare zf.ZipFile(...) expression would
# leave it open for the lifetime of the kernel.
with zf.ZipFile(zip_path, 'r') as archive:
    archive_members = archive.namelist()

# Show only the first few entries so the output stays short.
archive_members[:10]

# Loading categorical text data 

In [None]:
from pathlib import Path
from sklearn.datasets import load_files

# Root folder of the text documents that load_files reads.
folder_path = Path('./datasets/text_contents')

# Read the folder into a single object for the inspection cells that follow.
load_files_test = load_files(container_path=folder_path)

In [None]:
# Display the full object returned by load_files.
load_files_test

In [None]:
# Check which Python type load_files returned.
type(load_files_test)

In [None]:
# Show the target_names attribute of the loaded object
# (presumably the category labels — confirm against the dataset folder).
load_files_test.target_names

In [None]:
# Show the data attribute; this is what gets passed to the vectorizer later on.
load_files_test.data

In [None]:
# Show the filenames attribute of the loaded object.
load_files_test.filenames

# Bag of Words Feature Extraction

Reference : https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [None]:
# import the library that will help us create the bag-of-words representation of a document.

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Create a CountVectorizer, leaving every constructor argument at its default.
vectorizer = CountVectorizer()

In [None]:
# Call get_params() — without the parentheses the cell only shows the bound
# method object instead of the vectorizer's parameters.
vectorizer.get_params()

In [None]:
# Obtain the vectorizer's analyzer callable and run it on a sample sentence
# to see what it produces for that text.
analyze_text = vectorizer.build_analyzer()
analyze_text("This is a python DS class!!, is it ??")

In [None]:
# Check the concrete class of the vectorizer object.
type(vectorizer)

In [None]:
from sklearn.exceptions import NotFittedError

# The vectorizer has not been fitted yet at this point in the notebook, so
# asking for feature names fails. Catch and show the error so that
# "Restart & Run All" does not stop here. get_feature_names_out() replaces
# get_feature_names(), which newer scikit-learn releases no longer provide.
try:
    vectorizer.get_feature_names_out()
except NotFittedError as not_fitted:
    print("Vocabulary not available yet:", not_fitted)

In [None]:
# Fit the vectorizer on the loaded documents so that it learns the
# bag-of-words vocabulary.
vectorizer.fit(load_files_test.data)

In [None]:
# get_feature_names() is not available in newer scikit-learn releases; use
# get_feature_names_out() and convert the returned array to a plain list so
# that `features` keeps the list type the original call produced.
features = vectorizer.get_feature_names_out().tolist()

In [None]:
# Check the container type holding the feature names.
type(features)

In [None]:
# Number of feature names reported by the vectorizer.
len(features)

In [None]:
# Display every feature name.
features

In [None]:
# Transform the same documents the vectorizer was fitted on into their
# bag-of-words representation.
bag_of_words = vectorizer.transform(load_files_test.data)

In [None]:
# Type of a single raw document, for comparison with the transformed output.
type(load_files_test.data[0])

In [None]:
# Type of the transformed result.
type(bag_of_words)

### Compressed Sparse Row Matrix

https://en.wikipedia.org/wiki/Sparse_matrix 

In [None]:
# Shape of the bag-of-words matrix
# (presumably documents x vocabulary terms — confirm).
bag_of_words.shape

In [None]:
# Select index 0 of the matrix (presumably the row for the first document).
bag_of_words[0]

In [None]:
# Type of the index-0 selection.
type(bag_of_words[0])

In [None]:
# Shape of the index-0 selection.
bag_of_words[0].shape

In [None]:
# .size of the index-0 selection (for a sparse matrix this presumably counts
# stored entries rather than all cells — confirm).
bag_of_words[0].size

In [None]:
# print() shows the text rendering of the selection instead of the notebook repr.
print(bag_of_words[0])

In [None]:
# Convert the index-0 selection with toarray() so every column is shown.
bag_of_words[0].toarray()

In [None]:
# Type of the fitted vectorizer's vocabulary_ attribute.
type(vectorizer.vocabulary_)

In [None]:
# Display the vocabulary_ attribute of the fitted vectorizer.
vectorizer.vocabulary_

In [None]:
# Number of entries in vocabulary_.
len(vectorizer.vocabulary_)

## Tf-Idf

https://en.wikipedia.org/wiki/Tf%E2%80%93idf

# Pipeline & Grid Search

In [None]:
import pandas as pd 
import string

from pprint import pprint

from time import time

import os

## Load the data

For this example we use the SMS Spam Collection data set from the UCI Machine Learning Repository.

https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [None]:
# Location of the SMS spam data file, expressed relative to the current
# working directory (the default start point, spelled out explicitly here).
file_path = os.path.relpath('./datasets/SMSSpamCollection.csv', start=os.curdir)

In [None]:
# Confirm the path that will be read.
file_path

In [None]:
# Read the tab-separated file. It carries no header row, so the column names
# are supplied here.
# NOTE(review): with default quoting a stray '"' inside a message may merge
# rows — consider quoting=csv.QUOTE_NONE and verify the row count.
sms_spam_collection_df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['target', 'message'],
)

In [None]:
# Check the type returned by pd.read_csv.
type(sms_spam_collection_df)

## Explore the data

In [None]:
# Preview the first rows of the data frame.
sms_spam_collection_df.head()

In [None]:
# Summary statistics for the data frame's columns.
sms_spam_collection_df.describe()

## Create train and test splits

In [None]:
from sklearn.model_selection import train_test_split

# Split messages and targets into train and test parts.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and the same scores further down); stratify keeps the class
# ratio of the target column the same in both parts.
message_train, message_test, target_train, target_test = train_test_split(
    sms_spam_collection_df['message'],
    sms_spam_collection_df['target'],
    random_state=42,
    stratify=sms_spam_collection_df['target'],
)

In [None]:
# Compare the shapes of the training and test message splits.
print(message_train.shape,message_test.shape)

In [None]:
# Type of the training targets produced by the split.
type(target_train)

- [ ] **30 sec: Explore the train and test splits**

## Build our Model 

**The model in this case requires a pipeline, because the input is text and it cannot be directly fed into our model.**

The pipeline will consist of a CountVectorizer --> TfidfTransformer --> Classifier.

The classifier we will choose for this exercise is the SGDClassifier.

In [None]:
# import vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

#import SGD classifier 

from sklearn.linear_model import SGDClassifier


#### Build the desired pipeline

In [None]:
# Create the data processing pipeline: raw text -> token counts -> tf-idf
# weights -> linear classifier trained with SGD.
# The step names are referenced by the grid-search parameter keys
# ('<step name>__<parameter>'), so they must not be renamed.

from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tf-idf', TfidfTransformer()),
    # random_state pins the classifier's internal randomness so repeated runs
    # of the notebook give the same scores.
    ('sgd_classifier', SGDClassifier(random_state=42)),
])



## Train the Model 

### Hyperparameter search using GridSearchCV

https://scikit-learn.org/stable/modules/grid_search.html#grid-search

https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py

In [None]:
# import grid search

from sklearn.model_selection import GridSearchCV

In [None]:
# Display the assembled pipeline.
pipeline

In [None]:
# List the pipeline's (name, estimator) steps.
pipeline.steps

In [None]:
# steps[1] is the 'tf-idf' entry; the second [1] picks the estimator object
# out of the (name, estimator) pair.
type(pipeline.steps[1][1])

In [None]:
# Keep a handle on the pipeline's tf-idf estimator (second step, index 1).
tfidf_transformer = pipeline.steps[1][1]

In [None]:
# Let's check out all the hyperparameters available on the tf-idf step.
tfidf_transformer.get_params()

In [None]:
# Hyperparameter grid for the search. Keys follow the
# '<pipeline step name>__<parameter name>' pattern, and three parameters are
# tuned — not only use_idf:
#   - tf-idf__use_idf
#   - sgd_classifier__max_iter
#   - vectorizer__max_features
grid_search_params = {'tf-idf__use_idf':(True,False),
                     'sgd_classifier__max_iter': (10, 50, 80,1000),
                      'vectorizer__max_features':(None,10,50,1000)
                     }

In [None]:
# Set up the search over the grid above: 5 cross-validation folds, all
# available cores (n_jobs=-1), and progress messages enabled (verbose=1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid_search_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Type of the grid search object.
type(grid_search)

In [None]:
# Announce the search, then pretty-print the grid that will be explored.
print("started grid search", "parameters are :", sep="\n")
pprint(grid_search_params)

In [None]:
# Run the grid search on the training split and report the wall-clock time.
start_time = time()

grid_search.fit(message_train, target_train)
# The message previously read 'doine in %f secdons'; the typos are fixed here.
print('done in %f seconds' % (time() - start_time))

In [None]:
# Show the searched grid again, for comparison with the best parameters.
grid_search_params

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Score of the selected parameter combination
# (presumably the mean cross-validated score — confirm).
grid_search.best_score_

## Test 

In [None]:
# Predict targets for the test messages with the fitted grid search.
pred_target = grid_search.predict(message_test)

In [None]:
# Display the predictions.
pred_target

In [None]:
# Element-wise comparison: True where the prediction equals the true target.
result = (pred_target == target_test)

In [None]:
# Raw values of the comparison result.
result.values

In [None]:
# Find the accuracy: the share of test predictions that match the true target.
# The mean of the boolean comparison is exactly that share, and unlike
# value_counts()[True] it does not raise KeyError when no prediction is correct.

accuracy = result.mean()

accuracy

In [None]:
# Number of correct predictions. Summing the booleans returns 0 when there are
# none, where value_counts()[True] would raise KeyError.
result.sum()

In [None]:
# Number of wrong predictions. Summing the negated booleans returns 0 when
# every prediction is correct, where value_counts()[False] would raise KeyError.
(~result).sum()