In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np   
import pandas as pd  

import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df = pd.read_parquet('../../src/components/test.parquet')

In [None]:
# Keep only the first 100 rows so the exploratory cells below run quickly.
df = df.iloc[:100]

In [None]:
# Hold out 25% of the complaint narratives, stratified on the category id so
# that both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer complaint narrative'],
    df['category_id'],
    stratify=df['category_id'],
    random_state=42,
    test_size=0.25,
)

In [None]:
X_train.shape,y_train.shape

In [None]:
# TF-IDF vectoriser configuration:
#   - English stop words are dropped and uni-, bi- and tri-grams are extracted
#   - terms are kept only if their document frequency lies in [25%, 75%]
#   - sublinear_tf replaces the raw term frequency tf with 1 + log(tf)
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=0.25,
    max_df=0.75,
    sublinear_tf=True,
)

In [None]:
import timeit

# Fit the vectoriser on the training narratives and turn each complaint into a
# TF-IDF vector, measuring the wall-clock time of the fit/transform step.
start = timeit.default_timer()
features = tfidf.fit_transform(X_train)
stop = timeit.default_timer()

print('Time: ', stop - start)

In [None]:
# `toarray()` returns a NumPy ndarray, which has no `to_parquet` method, so the
# dense matrix is wrapped in a DataFrame first. Parquet requires string column
# names, hence the cast of the default integer column labels.
# Note: densifying is only reasonable for small matrices.
features_frame = pd.DataFrame(features.toarray())
features_frame.columns = features_frame.columns.astype(str)
features_frame.to_parquet('u.parquet')


In [None]:
# The plain DataFrame constructor iterates a scipy sparse matrix row by row and
# produces a single object column; the sparse accessor builds a proper
# (rows x vocabulary) frame backed by sparse columns instead.
pd.DataFrame.sparse.from_spmatrix(features)

In [None]:
pd.DataFrame(y_test)

In [None]:
features.shape

In [None]:
# Open the saved preprocessor archive (.npz).
# allow_pickle=True lets NumPy unpickle object arrays stored in the archive;
# unpickling can execute arbitrary code, so only load files from a trusted source.
preprocessor = np.load('../../artifacts/preprocessor/preprocessor.npz',allow_pickle=True)

In [None]:
preprocessor['arr_0']

In [None]:
# `features` was produced by fitting the vectoriser on the training narratives,
# so the labels aligned with it row-for-row are the current `y_train`.
# (`labels` was previously referenced without ever being defined.)
# This cell overwrites X_train/y_train, so re-run the earlier split and
# vectorising cells before executing it a second time.
# NOTE(review): with test_size=0.001 a stratified split needs a large dataset
# (at least one test row per class) — confirm this is intended for small samples.
labels = y_train
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.001,
    random_state=42,
    stratify=labels,
)

In [None]:
y_test.value_counts()

In [None]:
X_test.shape

In [None]:
from sklearn.model_selection import KFold

# Baseline candidates as (display name, pipeline step name, estimator).
# Every estimator is wrapped in its own single-step Pipeline so that all of
# them can be cross-validated through the same interface.
pipelines = [
    (display_name, Pipeline([(step_name, estimator)]))
    for display_name, step_name, estimator in [
        ('LogisticRegression', 'LR', LogisticRegression()),
        ('SVC', 'SVC', LinearSVC()),
        ('MultinomailNB', 'MNB', MultinomialNB()),
        ('RandomForest', 'RF', RandomForestClassifier()),
        ('XGBoost', 'XGB', XGBClassifier()),
    ]
]


In [None]:
pipelines

In [None]:
X_test.shape,y_test.shape,X_train.shape, y_train.shape

In [None]:
y_test

In [None]:
y_train.shape

In [None]:
# Cross-validate every baseline pipeline and collect the per-fold scores.
#
# The scores are computed on the vectorised training split (X_train, y_train).
# The previous version paired `features` with `y_test`, whose row counts do not
# match, so scikit-learn rejected the inputs.
model_name = []
results = []

# Two plain (unshuffled) folds; the splitter is identical for every model, so
# it is built once rather than on each iteration.
kfold = KFold(n_splits=2)

for pipe, model in pipelines:
    cross_validation_results = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring='balanced_accuracy'
    )
    results.append(cross_validation_results)
    model_name.append(pipe)

    # model[0] is the single estimator held by the pipeline.
    msg = "%s: 'mean accuracy :' %f  'std deviation :' (%f)" % (
        model[0],
        cross_validation_results.mean(),
        cross_validation_results.std(),
    )

    print(msg)

In [None]:
# Average the fold scores of each model and tabulate them next to the model
# names. The 'Accuracy' column holds the mean of whatever metric the
# cross-validation cell was scored with.
results_mean = np.array(results).mean(axis=1)
baseline_models = pd.DataFrame({'Model': model_name, 'Accuracy': results_mean})

baseline_models

In [None]:
df['Consumer complaint narrative'].drop_duplicates().shape

In [None]:
df

In [None]:
X_test,y_test

In [None]:
y_test.shape

In [None]:
# Re-split the raw narratives, this time holding out 10% of the rows,
# again stratified on the category id.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer complaint narrative'],
    df['category_id'],
    stratify=df['category_id'],
    random_state=42,
    test_size=0.1,
)

In [None]:
X_test.shape

In [None]:
y_test.value_counts()

In [None]:
from sklearn.multiclass import OneVsRestClassifier


In [None]:
# Tune the TF-IDF vocabulary bounds and n-gram range for a one-vs-rest linear SVM.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# TfidfVectorizer raises when max_df is smaller than min_df, so only pairs with
# min_df <= max_df are searched; the other pairs could never be fitted.
document_frequency_bounds = (0.25, 0.5, 0.75)
parameters = [
    {
        'tfidf__max_df': [max_df],
        'tfidf__min_df': [
            min_df for min_df in document_frequency_bounds if min_df <= max_df
        ],
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    }
    for max_df in document_frequency_bounds
]

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)

# Hyper-parameters are selected on the training split only, so the held-out
# split stays untouched for the final evaluation.
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_params_)

In [None]:
grid_search_tune.best_estimator_.steps

In [None]:
# Read the four preprocessed partitions (features and labels, train and test)
# from the preprocessed-data directory, in the same order as they are unpacked.
X_train, X_test, y_train, y_test = (
    pd.read_parquet(f'../../data/preprocessed/{partition}.parquet')
    for partition in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train.shape, y_test.shape

In [1]:
import scipy.sparse
sparse_matrix = scipy.sparse.load_npz('../../data/preprocessed/X_train.npz')

In [None]:
sparse_matrix

In [None]:
import pandas as pd

# pd.SparseDataFrame was removed in pandas 1.0; the DataFrame sparse accessor
# is its replacement and builds a sparse-backed frame straight from the
# scipy matrix without densifying it.
pd.DataFrame.sparse.from_spmatrix(sparse_matrix)

In [None]:
import h5py

hf = h5py.File('data.h5', 'w')

In [None]:
# h5py cannot serialise a scipy sparse matrix object directly (it has no HDF5
# dtype), so the three CSR component arrays are stored in a group together
# with the matrix shape. The matrix can be rebuilt with
# scipy.sparse.csr_matrix((data, indices, indptr), shape=shape).
csr = sparse_matrix.tocsr()
group = hf.create_group('dataset_1')
for component in ('data', 'indices', 'indptr'):
    group.create_dataset(component, data=getattr(csr, component))
group.attrs['shape'] = csr.shape

# Push the buffered writes to disk.
hf.flush()


In [None]:
import scipy.sparse as ss
import h5sparse
import numpy as np

In [None]:
# Small 4x3 example matrix (four non-zero entries) in CSR format, used to try
# out the HDF5 round trip below.
sparse_matrix = ss.csr_matrix(
    [
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0],
        [1, 1, 0],
    ],
    dtype=np.float64,
)

In [None]:
hf = h5py.File('test.h5', 'w')


In [None]:
# Release the write handle that is still open on 'test.h5' before the file is
# reopened through h5sparse (closing an already-closed handle is harmless).
hf.close()

# The mode is given explicitly: recent h5py versions open files read-only by
# default, which would make create_dataset fail. 'a' opens read/write and
# keeps any existing content.
with h5sparse.File("test.h5", "a") as h5f:
    h5f.create_dataset('sparse/matrix', data=sparse_matrix)

In [None]:
hf = h5py.File('test.h5', 'r')


In [None]:
hf.keys()

In [None]:
n1 = hf.get('sparse/matrix')

In [None]:
n1

In [None]:
n1 = np.array(n1)

In [None]:
n1

In [None]:
h5f = h5sparse.File("test.h5")

In [None]:
h5f['sparse/matrix'][:]

In [None]:
sparse_matrix

In [2]:
import gzip
import numpy

# The context manager guarantees the gzip stream is flushed and closed even if
# saving raises.
# NOTE: numpy.save stores a scipy sparse matrix as a pickled object array, so
# reading the file back requires numpy.load(..., allow_pickle=True);
# scipy.sparse.save_npz is the native, pickle-free alternative.
with gzip.GzipFile("my_array.npy.gz", "w") as f:
    numpy.save(file=f, arr=sparse_matrix)

In [7]:
import yaml

In [8]:
    # Load the project configuration from the YAML file two levels up.
    # Later cells index into `config`, so a failed read is reported with its
    # cause and re-raised instead of being swallowed (which previously left
    # `config` undefined and surfaced later as a confusing NameError).
    path_to_yaml = "../../config.yaml"
    try:
        with open(path_to_yaml, "r") as file:
            config = yaml.safe_load(file)
    except (OSError, yaml.YAMLError) as e:
        print(f"Error reading the config file {path_to_yaml!r}: {e}")
        raise


In [9]:
import pandas as pd

In [10]:
    # Read the training feature matrix (sparse .npz) and its labels (parquet)
    # from the locations listed under config['model_trainer'], resolved two
    # directories above the notebook. The label index is reset to 0..n-1.
    X_train = scipy.sparse.load_npz(
        '../../' + config['model_trainer']['X_train_path']
    )
    y_train = pd.read_parquet(
        '../../' + config['model_trainer']['y_train_path']
    ).reset_index(drop=True)


In [13]:
# Shape of the full preprocessed training matrix. The path is relative to the
# notebook (as in the other loading cells) rather than a user-specific
# absolute path, so the cell also runs on other machines.
scipy.sparse.load_npz('../../data/preprocessed/X_train.npz').shape

(692411, 891120)

In [44]:
y_train.shape

(692411, 1)

In [14]:
X_train

<100x891120 sparse matrix of type '<class 'numpy.float64'>'
	with 23869 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.svm import LinearSVC

In [40]:
# Fit a linear SVM on the training matrix and report its training accuracy.
# random_state pins the solver's internal shuffling so the fit is reproducible.
# The labels are flattened to 1-D for both fit and score; score was previously
# given the single-column DataFrame and relied on implicit conversion.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train.values.ravel())
print(model.score(X_train, y_train.values.ravel()))

1.0


In [21]:
X_train.shape

(100, 891120)

In [22]:
y_train.values.shape

(100, 1)

In [28]:
list(y_train)

['complaint_category_id']

In [4]:
import scipy

In [38]:
y_train['complaint_category_id'].values

array([1, 3, 6, 4, 0, 0, 2, 2, 2, 1, 4, 0, 4, 0, 0, 0, 4, 1, 0, 1, 0, 0,
       1, 1, 5, 0, 1, 6, 0, 1, 2, 3, 0, 0, 0, 4, 1, 1, 0, 0, 0, 1, 2, 0,
       2, 1, 0, 0, 3, 4, 0, 0, 6, 0, 2, 5, 3, 5, 0, 4, 1, 5, 2, 2, 0, 1,
       2, 2, 0, 2, 1, 0, 1, 0, 0, 5, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 3, 1,
       0, 4, 0, 0, 2, 5, 1, 1, 6, 5, 2, 0])

In [5]:
    # Load the training feature matrix and labels from the paths stored under
    # config['model_trainer']; the label index is reset to 0..n-1.
    # Requires `config` to exist already — the recorded output of this cell is
    # a NameError because it was run before the YAML-loading cell.
    # NOTE(review): the other config-based loading cell prefixes these paths
    # with '../../' — confirm which form is correct for the notebook's location.
    X_train = scipy.sparse.load_npz(config['model_trainer']['X_train_path'])
    y_train = pd.read_parquet(config['model_trainer']['y_train_path']).reset_index(drop=True)

NameError: name 'config' is not defined

In [45]:
    # Refit the linear SVM on the currently loaded training data.
    # random_state pins the solver's internal shuffling so that repeated runs
    # give the same model and therefore the same scores.
    model = LinearSVC(random_state=42)
    model.fit(X_train, y_train.values.ravel())

In [46]:
model.score(X_train, y_train.values.ravel())

0.9755434272419127

In [53]:
# Load the held-out feature matrix and labels. Both paths are relative to the
# notebook; the labels were previously read from a user-specific absolute
# path, which only resolves on one machine.
X_test = scipy.sparse.load_npz('../../data/preprocessed/X_test.npz')
y_test = pd.read_parquet('../../data/preprocessed/y_test.parquet').reset_index(drop=True)

In [54]:
X_test.shape

(230804, 891120)

In [55]:
y_test.shape

(230804, 1)

In [69]:
y_pred = model.predict(X_test)

In [70]:
y_pred

array([0, 4, 0, ..., 0, 0, 0])

In [71]:
y_test.values.ravel()

array([0, 7, 0, ..., 0, 0, 0])

In [72]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [73]:
accuracy_score(y_test, y_pred)

0.8684684840817317

In [74]:
balanced_accuracy_score(y_test,y_pred)

0.6844094737730855

In [66]:
aa = model.predict(X_train)

In [67]:
accuracy_score(y_train.values.ravel(), aa)

0.9755434272419127

In [68]:
balanced_accuracy_score(y_train.values.ravel(), aa)

0.9716694934674782

In [1]:
X_test.shape

NameError: name 'X_test' is not defined