In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
#Lecture du Dataset
df= pd.read_csv('../data/external/train_2.csv')

In [10]:
#Valeurs nulles
df.isna().sum()

Date received                         0
Product                               0
Sub-product                           0
Issue                                 0
Sub-issue                             0
Consumer complaint narrative          0
Company public response               0
Company                               0
State                                 0
ZIP code                              0
Tags                                  0
Consumer consent provided?            0
Submitted via                         0
Date sent to company                  0
Company response to consumer          0
Timely response?                      0
Consumer disputed?              1197844
Complaint ID                          0
dtype: int64

In [11]:
#Nombre de modalités pour la variable 'Product'
df.Product.value_counts()

Credit reporting, credit repair services, or other personal consumer reports    778990
Debt collection                                                                 210805
Credit reporting or other personal consumer reports                              72657
Mortgage                                                                         59437
Checking or savings account                                                      46699
Credit card or prepaid card                                                      35040
Credit reporting                                                                 21216
Student loan                                                                     21110
Name: Product, dtype: int64

In [12]:
# Balance the data
df_grouped_by = df.groupby(['Product'])
df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))
df_balanced = df_balanced.droplevel(['Product'])
df_balanced.head(3)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,07/19/23,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I deposited a {$550.00} USPS money order XX/XX...,Company has responded to the consumer and the ...,"BANK OF AMERICA, NATIONAL ASSOCIATION",GA,30294,Older American,Consent provided,Web,07/19/23,Closed with explanation,Yes,,7273276
1,09/25/23,Checking or savings account,Checking account,Managing an account,Funds not handled or disbursed as instructed,"On XX/XX/2023, My checking account was restric...",,JPMORGAN CHASE & CO.,NV,89074,,Consent provided,Web,09/25/23,Closed with explanation,Yes,,7602001
2,10/23/23,Checking or savings account,Checking account,Managing an account,Problem using a debit or ATM card,On XX/XX/2023 I was in XXXX XXXX my debit card...,Company has responded to the consumer and the ...,"SANTANDER HOLDINGS USA, INC.",NY,10308,,Consent provided,Web,10/23/23,Closed with explanation,Yes,,7744633


In [13]:
#Nombre de modalités après le balancement des données
df_balanced.Product.value_counts()

Checking or savings account                                                     21110
Credit card or prepaid card                                                     21110
Credit reporting                                                                21110
Credit reporting or other personal consumer reports                             21110
Credit reporting, credit repair services, or other personal consumer reports    21110
Debt collection                                                                 21110
Mortgage                                                                        21110
Student loan                                                                    21110
Name: Product, dtype: int64

In [14]:
#nombre de modalités de la variable 'Issue'
df_balanced.Issue.value_counts()

Incorrect information on your report                                                21816
Incorrect information on credit report                                              21110
Managing an account                                                                 20976
Dealing with your lender or servicer                                                19118
Problem with a purchase shown on your statement                                     15714
Trouble during payment process                                                      13406
Improper use of your report                                                         12251
Problem with a credit reporting company's investigation into an existing problem    10994
Attempts to collect debt not owed                                                    9168
Struggling to pay mortgage                                                           6823
Problem with a company's investigation into an existing problem                      5562
Written no

In [15]:
#Remove stop words
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
import nltk

In [16]:
stopword=set(stopwords.words('english'))

In [17]:
#Text processing
import string
import re
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    #text = [stemmer.stem(word) for word in text.split(' ')]
    #text=" ".join(text)
    return text

In [18]:
df_balanced['clean_text'] = df_balanced["Consumer complaint narrative"].apply(clean)

In [20]:
#Exemple d'une plainte: texte brut
df_balanced["Consumer complaint narrative"].iloc[1]

'On XX/XX/2023, My checking account was restricted with Chase Bank. They said my account was being close and wishes to not have me as a customer, did not give any information beyond that. There was {$18000.00} in my joint checking account. \n\nI called every week, they told me 10 business days later they would over night a check of my funds. \n\nOn XX/XX/2023. I called and a representative told me she did not see any funds being released. I escalated to a supervisor that did confirm the funds, but they were withdrawn to an internal chase account to their internal investigation bureau. I asked for contact information with this investigation bureau and was told there is not way to contact them and to just wait. Could not tell me how long to wait, and just wait.\n\nChase did not send me a letter in the mail, or contact me. I only found out my account was restricted when a friend tried to send me money through XXXX, that was when I discovered my account was restricted on XX/XX/XXXX.'

In [21]:
# Exemple d'une plainte après nettoyage
df_balanced["clean_text"].iloc[1]

' checking account restricted chase bank said account close wishes customer give information beyond  joint checking account called every week told  business days later would night check funds  called representative told see funds released escalated supervisor confirm funds withdrawn internal chase account internal investigation bureau asked contact information investigation bureau told way contact wait could tell long wait waitchase send letter mail contact found account restricted friend tried send money xxxx discovered account restricted xxxxxxxx'

In [22]:
data=df_balanced[["Product", "clean_text", "Issue"]].copy()

In [23]:
data.columns=["category", "complaint", "Issue"]

In [24]:
data.head(2)

Unnamed: 0,category,complaint,Issue
0,Checking or savings account,deposited usps money order xxxxxxxx bank amer...,Managing an account
1,Checking or savings account,checking account restricted chase bank said a...,Managing an account


In [25]:
# Encoder la variable cible "category"
data["category_id"]= data["category"].factorize()[0]

In [26]:
category_id_df = data[['category', 'category_id']].drop_duplicates()

In [27]:
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)

### Développement du modèle

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [103]:
#pip install -U textblob

Note: you may need to restart the kernel to use updated packages.


In [29]:
# Feature engineering
count_vec = CountVectorizer(max_df=0.90,min_df=2,
                           max_features=1000,stop_words='english')

bagofword_vec = count_vec.fit_transform(data['complaint'])
bagofword_vec
labels =data.category_id

In [30]:
#extraction des prédicteurs et cible
X= data.loc[:, 'complaint']
y= data.loc[:, 'category_id']

In [31]:
# split des data
X_train, X_test, y_train, y_test,indices_train, indices_test = train_test_split(X,y,data.index,
                                                    test_size=0.2)
#X_train.shape,X_test.shape

In [32]:
xtrain_cv = count_vec.fit_transform(X_train)
xtest_cv = count_vec.transform(X_test)

### Entrainement du modèle:
Le modèle sera entrainé avec plusieurs algorithmes de classification. Le choix du modèle final se fera sur la base de la performance: temps d'exécution et métriques

In [36]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [37]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(xtrain_cv,y_train)

#Predict the response for test dataset
y_pred = clf.predict(xtrain_cv)
clf.score(xtest_cv,y_test)

0.6994315490288963

In [38]:
from sklearn.linear_model import SGDClassifier

In [39]:
%%time
sgdc = SGDClassifier()
sgdc.fit(xtrain_cv, y_train)
#sgdc.fit(xtrain_cv, y_train)
sgdc.score(xtest_cv, y_test)

CPU times: user 10.2 s, sys: 52.6 ms, total: 10.2 s
Wall time: 10.3 s


0.7110966366650876

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
%%time
classifier = LogisticRegression()
classifier.fit(xtrain_cv, y_train)
score = classifier.score(xtest_cv, y_test)

print("Accuracy:", score)

Accuracy: 0.7523685457129322
CPU times: user 28.6 s, sys: 901 ms, total: 29.5 s
Wall time: 7.71 s


In [42]:
%%time
mb = MultinomialNB()
mb.fit(xtrain_cv, y_train.ravel())
#mb.fit(xtrain_cv,y_train)
mbpred = mb.predict(xtest_cv)
print(metrics.accuracy_score(y_test,mbpred))

0.7132579346281384
CPU times: user 76.9 ms, sys: 5.32 ms, total: 82.2 ms
Wall time: 81.2 ms


In [43]:
%%time
rf = RandomForestClassifier()
rf.fit(xtrain_cv,y_train)
print(metrics.accuracy_score(y_test,rf.predict(xtest_cv)))

0.8056016106110848
CPU times: user 9min 27s, sys: 1.46 s, total: 9min 28s
Wall time: 9min 31s


In [124]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [33]:
import xgboost as xgb

In [35]:
%%time
# Train the XGBoost model
model = xgb.XGBClassifier(learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 n_jobs=-1)
model.fit(xtrain_cv, y_train)
predictions = model.predict(xtest_cv)
# Evaluate the model performance
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 80.73%
CPU times: user 5min 49s, sys: 53.4 s, total: 6min 42s
Wall time: 52 s


### Issues: 
Développer un 2e modèle qui va prédire 'issue' de la plainte. Donc en plus de la catégorie, on va pouvoir classifier les plaintes selon 'issue'

In [46]:
data.head(3)

Unnamed: 0,category,complaint,Issue,category_id
0,Checking or savings account,wells fargo bank account timeframe listed sett...,Managing an account,0
1,Checking or savings account,opened pnc account part account opening inclu...,Managing an account,0
2,Checking or savings account,regarding td bank routing number xxxx account ...,Managing an account,0


In [47]:
df_balanced.Issue.value_counts()

Incorrect information on your report                                                21770
Incorrect information on credit report                                              21110
Managing an account                                                                 20974
Dealing with your lender or servicer                                                19118
Problem with a purchase shown on your statement                                     15734
Trouble during payment process                                                      13444
Improper use of your report                                                         12180
Problem with a credit reporting company's investigation into an existing problem    10988
Attempts to collect debt not owed                                                    9135
Struggling to pay mortgage                                                           6772
Problem with a company's investigation into an existing problem                      5680
Written no

In [48]:
data2= pd.DataFrame(data, columns=['complaint', 'Issue'])

In [49]:
data2.head(2)

Unnamed: 0,complaint,Issue
0,wells fargo bank account timeframe listed sett...,Managing an account
1,opened pnc account part account opening inclu...,Managing an account


In [50]:
# Encoder la variable cible "category"
data2["issue_id"]= data2["Issue"].factorize()[0]

In [51]:
category_id_df_issue = data2[['Issue', 'issue_id']].drop_duplicates()

In [52]:
category_to_id_issue = dict(category_id_df_issue.values)
id_to_category_issue = dict(category_id_df_issue[['issue_id', 'Issue']].values)

In [53]:
X_issue= data2.loc[:, 'complaint']
y_issue= data2.loc[:, 'issue_id']

In [54]:
X_train_issue, X_test_issue, y_train_issue, y_test_issue = train_test_split(X_issue,y_issue,
                                                    test_size=0.2)
X_train_issue.shape,X_test_issue.shape

((135104,), (33776,))

In [55]:
xtrain_cv_issue = count_vec.fit_transform(X_train_issue)
xtest_cv_issue = count_vec.transform(X_test_issue)
xtrain_cv_issue.shape

(135104, 1000)

In [124]:
# Create Decision Tree classifer object
clf_issue = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf_issue = clf_issue.fit(xtrain_cv_issue,y_train_issue)

#Predict the response for test dataset
#y_pred = clf_issue.predict(X_train_issue)
clf_issue.score(xtest_cv_issue,y_test_issue)

0.6287896731406916

In [127]:
%%time
from sklearn.linear_model import SGDClassifier

sgdc_issue = SGDClassifier()
sgdc_issue.fit(xtrain_cv_issue, y_train_issue)
sgdc_issue.score(xtest_cv_issue, y_test_issue)

CPU times: user 15.5 s, sys: 72.8 ms, total: 15.6 s
Wall time: 15.7 s


0.6392408810990052

In [128]:
%%time
classifier_issue = LogisticRegression()
classifier_issue.fit(xtrain_cv_issue, y_train_issue)
score_issue = classifier_issue.score(xtest_cv_issue, y_test_issue)

print("Accuracy:", score_issue)

Accuracy: 0.6808680720037896
CPU times: user 34.9 s, sys: 38.3 s, total: 1min 13s
Wall time: 10.3 s


In [129]:
%%time
mb_issue = MultinomialNB()
mb_issue.fit(xtrain_cv_issue,y_train_issue)
mbpred_issue = mb_issue.predict(xtest_cv_issue)
print(metrics.accuracy_score(y_test_issue,mbpred_issue))

0.6457839886309806
CPU times: user 81.4 ms, sys: 41.1 ms, total: 122 ms
Wall time: 126 ms


In [130]:
%%time
rf_issue = RandomForestClassifier()
rf_issue.fit(xtrain_cv_issue,y_train_issue)
print(metrics.accuracy_score(y_test_issue,rf_issue.predict(xtest_cv_issue)))

0.7398744670772146
CPU times: user 9min 52s, sys: 2.1 s, total: 9min 54s
Wall time: 9min 57s


In [56]:
%%time
# Train the XGBoost model
model_issue = xgb.XGBClassifier(n_jobs=-1)
model_issue.fit(xtrain_cv_issue, y_train_issue)
predictions_issue = model_issue.predict(xtest_cv_issue)
# Evaluate the model performance
accuracy_issue = metrics.accuracy_score(y_test_issue, predictions_issue)
print("Accuracy: {:.2f}%".format(accuracy_issue * 100))

Accuracy: 73.55%
CPU times: user 1min 39s, sys: 14.8 s, total: 1min 54s
Wall time: 15.9 s


In [85]:
example= data2['complaint'].iloc[50000]

In [86]:
example

'divorce decree indicate ex wife pay medical expenses created children collection placed report resulting failure pay medical bills daughter disputed issue experian sent report stating collection remain work payment arrangement creditor'

In [87]:
example= count_vec.transform([example])

In [88]:
model_issue.predict(example)

array([6])

In [67]:
category_to_id_issue
#id_to_category_issue

{'Managing an account': 0,
 'Incorrect information on your report': 1,
 'Improper use of your report': 2,
 "Problem with a credit reporting company's investigation into an existing problem": 3,
 "Problem with a company's investigation into an existing problem": 4,
 'Problem with a purchase shown on your statement': 5,
 'Incorrect information on credit report': 6,
 'Written notification about debt': 7,
 'Attempts to collect debt not owed': 8,
 "Cont'd attempts collect debt not owed": 9,
 'False statements or representation': 10,
 'Took or threatened to take negative or legal action': 11,
 'Communication tactics': 12,
 'Trouble during payment process': 13,
 'Struggling to pay mortgage': 14,
 'Dealing with your lender or servicer': 15}

In [84]:
data2.iloc[50000]

complaint    divorce decree indicate ex wife pay medical ex...
Issue                   Incorrect information on credit report
issue_id                                                     6
Name: 7780, dtype: object

### Evidently AI

In [132]:
pip install evidently

Collecting evidently
  Downloading evidently-0.4.12-py3-none-any.whl (16.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting uvicorn>=0.22.0
  Downloading uvicorn-0.25.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.3/60.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rich>=13
  Downloading rich-13.7.0-py3-none-any.whl (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.6/240.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting fastapi-restful>=0.5.0
  Downloading fastapi_restful-0.5.0-py3-none-any.whl (18 kB)
Collecting typing-inspect>=0.9.0
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting watchdog>=3
  Downloading watchdog-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91

      Successfully uninstalled typing_extensions-4.3.0
  Attempting uninstall: pygments
    Found existing installation: Pygments 2.11.2
    Uninstalling Pygments-2.11.2:
      Successfully uninstalled Pygments-2.11.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.3.3 requires pyqt5<5.16, which is not installed.
spyder 5.3.3 requires pyqtwebengine<5.16, which is not installed.
tensorflow 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.9.0 which is incompatible.[0m[31m
[0mSuccessfully installed distro-1.9.0 evidently-0.4.12 fastapi-0.108.0 fastapi-restful-0.5.0 h11-0.14.0 iterative-telemetry-0.0.8 markdown-it-py-3.0.0 mdurl-0.1.2 pydantic-1.10.13 pygments-2.17.2 rich-13.7.0 starlette-0.32.0.post1 typer-0.9.0 typing-extensions-4.9.0 typing-inspect-0.9.0 uvicorn-0.25.0 watchdog-3.0.0
Note: you may need to restart

In [1]:
import evidently
evidently.__version__

'0.4.12'

In [6]:
!jupyter nbextension install --sys-prefix --symlink --overwrite --py evidently

Installing /Users/omarsaaoui/opt/anaconda3/lib/python3.9/site-packages/evidently/nbextension/static -> evidently
Removing: /Users/omarsaaoui/opt/anaconda3/share/jupyter/nbextensions/evidently
Symlinking: /Users/omarsaaoui/opt/anaconda3/share/jupyter/nbextensions/evidently -> /Users/omarsaaoui/opt/anaconda3/lib/python3.9/site-packages/evidently/nbextension/static
- Validating: [32mOK[0m

    To initialize this nbextension in the browser every time the notebook (or other app) loads:
    
          jupyter nbextension enable evidently --py --sys-prefix
    


In [136]:
pip install jupyter_contrib_nbextensions

Collecting jupyter_contrib_nbextensions
  Downloading jupyter_contrib_nbextensions-0.7.0.tar.gz (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting jupyter_contrib_core>=0.3.3
  Downloading jupyter_contrib_core-0.4.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting jupyter_highlight_selected_word>=0.1.1
  Downloading jupyter_highlight_selected_word-0.2.0-py2.py3-none-any.whl (11 kB)
Collecting jupyter_nbextensions_configurator>=0.4.0
  Downloading jupyter_nbextensions_configurator-0.6.3-py2.py3-none-any.whl (466 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m466.9/466.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


Building wheels for collected packages: jupyter_contrib_nbextensions, jupyter_contrib_core
  Building wheel for jupyter_contrib_nbextensions (setup.py) ... [?25ldone
[?25h  Created wheel for jupyter_contrib_nbextensions: filename=jupyter_contrib_nbextensions-0.7.0-py2.py3-none-any.whl size=23428784 sha256=db16dd9784de5a3d197df3e0941fc384112d4b2e7dae5a582e543f779b187d00
  Stored in directory: /Users/omarsaaoui/Library/Caches/pip/wheels/e7/99/91/7f24a075786a6760f3ac32ab5fa92e1d1a90e0d2cd8958adfd
  Building wheel for jupyter_contrib_core (setup.py) ... [?25ldone
[?25h  Created wheel for jupyter_contrib_core: filename=jupyter_contrib_core-0.4.2-py2.py3-none-any.whl size=17484 sha256=d147b1f4c21d353ce88d291b19e8ec44551d29ba6d835b0b9e5c2a3e99a13d4a
  Stored in directory: /Users/omarsaaoui/Library/Caches/pip/wheels/57/9f/80/32c07b8a950a45f6cf8cd5980c22a27ce514c27b795250e497
Successfully built jupyter_contrib_nbextensions jupyter_contrib_core
Installing collected packages: jupyter_highligh

In [8]:
!jupyter nbextension enable evidently --py --sys-prefix

Enabling notebook extension evidently/extension...
      - Validating: [32mOK[0m
