In [None]:
import pandas as pd

# URL of the data
url = "https://raw.githubusercontent.com/phyokyi/datatalk_midterm_project_202311/main/data.tsv"

# Read the tab-separated file and normalise the header to snake_case
# (lower-case, spaces replaced by underscores).
data = pd.read_csv(url, sep='\t')
data.columns = data.columns.str.lower().str.replace(' ', '_')

# Keep only the enrolment year. `dayfirst=True` states the day/month order
# explicitly instead of letting pandas guess, which is what triggered the
# parse warning on this line.
# NOTE(review): assumes the column is stored day-first (DD-MM-YYYY) — confirm.
data['dt_customer'] = pd.to_datetime(data['dt_customer'], dayfirst=True).dt.year

# Replace every missing value with 0. Reassigning (rather than inplace=True)
# keeps the cell a simple, re-runnable expression.
# NOTE(review): zero-filling is applied to all columns alike; for a numeric
# column such as income a 0 is a real-looking value — confirm this is intended.
data = data.fillna(0)

# Display the first few rows of the data
print(data.head())

     id  year_birth   education marital_status   income  kidhome  teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

   dt_customer  recency  mntwines  ...  numwebvisitsmonth  acceptedcmp3  \
0         2012       58       635  ...                  7             0   
1         2014       38        11  ...                  5             0   
2         2013       26       426  ...                  4             0   
3         2014       26        11  ...                  6             0   
4         2014       94       173  ...                  5             0   

   acceptedcmp4  acceptedcmp5  acceptedcmp1  acceptedcmp2  complain  \
0        

  data['dt_customer'] = pd.to_datetime(data['dt_customer']).dt.year


In [None]:
# Show the column index after the header was lower-cased and spaces were replaced by underscores.
data.columns

Index(['id', 'year_birth', 'education', 'marital_status', 'income', 'kidhome',
       'teenhome', 'dt_customer', 'recency', 'mntwines', 'mntfruits',
       'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts',
       'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
       'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth',
       'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1',
       'acceptedcmp2', 'complain', 'z_costcontact', 'z_revenue', 'response'],
      dtype='object')

In [None]:
# Per-column count of missing values. The load cell already filled NaN with 0,
# so this is a sanity check that nothing is left.
missing_per_column = data.isna().sum()
print(missing_per_column)

id                     0
year_birth             0
education              0
marital_status         0
income                 0
kidhome                0
teenhome               0
dt_customer            0
recency                0
mntwines               0
mntfruits              0
mntmeatproducts        0
mntfishproducts        0
mntsweetproducts       0
mntgoldprods           0
numdealspurchases      0
numwebpurchases        0
numcatalogpurchases    0
numstorepurchases      0
numwebvisitsmonth      0
acceptedcmp3           0
acceptedcmp4           0
acceptedcmp5           0
acceptedcmp1           0
acceptedcmp2           0
complain               0
z_costcontact          0
z_revenue              0
response               0
dtype: int64


In [None]:
# One-hot encode the non-numeric columns, then bring the generated column
# names (e.g. "education_2n Cycle") to the same snake_case style as the rest.
data_encoded = pd.get_dummies(data)
data_encoded.columns = [
    column_name.lower().replace(' ', '_') for column_name in data_encoded.columns
]

In [None]:
# Display the one-hot encoded frame using the notebook's rich DataFrame repr.
data_encoded

Unnamed: 0,id,year_birth,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,...,education_master,education_phd,marital_status_absurd,marital_status_alone,marital_status_divorced,marital_status_married,marital_status_single,marital_status_together,marital_status_widow,marital_status_yolo
0,5524,1957,58138.0,0,0,2012,58,635,88,546,...,0,0,0,0,0,0,1,0,0,0
1,2174,1954,46344.0,1,1,2014,38,11,1,6,...,0,0,0,0,0,0,1,0,0,0
2,4141,1965,71613.0,0,0,2013,26,426,49,127,...,0,0,0,0,0,0,0,1,0,0
3,6182,1984,26646.0,1,0,2014,26,11,4,20,...,0,0,0,0,0,0,0,1,0,0
4,5324,1981,58293.0,1,0,2014,94,173,43,118,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,61223.0,0,1,2013,46,709,43,182,...,0,0,0,0,0,1,0,0,0,0
2236,4001,1946,64014.0,2,1,2014,56,406,0,30,...,0,1,0,0,0,0,0,1,0,0
2237,7270,1981,56981.0,0,0,2014,91,908,48,217,...,0,0,0,0,1,0,0,0,0,0
2238,8235,1956,69245.0,0,1,2014,8,428,30,214,...,1,0,0,0,0,0,0,1,0,0


In [None]:
# Pairwise correlations between every pair of encoded columns
# (pandas' default correlation method).
correlation_matrix = data_encoded.corr()

# Take the 'complain' column of that matrix and rank the features from the
# most positively to the most negatively correlated with the target.
complain_correlations = correlation_matrix.loc[:, 'complain']
correlation_with_target = complain_correlations.sort_values(ascending=False)

# Show the ranked correlations
print(correlation_with_target)

complain                   1.000000
kidhome                    0.040207
id                         0.033883
education_2n_cycle         0.033837
education_graduation       0.031820
numwebvisitsmonth          0.019769
marital_status_single      0.016935
recency                    0.013231
acceptedcmp3               0.008415
teenhome                   0.003138
numdealspurchases          0.000420
marital_status_married    -0.000952
response                  -0.001707
marital_status_divorced   -0.002661
marital_status_yolo       -0.002908
marital_status_absurd     -0.002908
marital_status_alone      -0.003563
marital_status_together   -0.004627
mntfruits                 -0.005166
acceptedcmp5              -0.009419
acceptedcmp2              -0.011334
education_basic           -0.015290
numwebpurchases           -0.016310
numstorepurchases         -0.016524
education_master          -0.018323
marital_status_widow      -0.018355
numcatalogpurchases       -0.020453
mntfishproducts           -0

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Separate the candidate features from the target variable.
# 'id' is excluded: it is a row identifier (presumably assigned arbitrarily —
# confirm), so any score it receives is noise rather than signal.
X = data_encoded.drop(columns=['complain', 'id'])
y = data_encoded['complain']

# Columns with a single distinct value carry no information, and the ANOVA
# F-test divides 0 by 0 for them, which is the likely source of the
# `f = msb / msw` RuntimeWarning and of NaN scores. Drop them up front.
constant_columns = [column for column in X.columns if X[column].nunique() <= 1]
X = X.drop(columns=constant_columns)

# Score every remaining feature against the target with the ANOVA F-test.
# `k` only affects which columns `transform` would keep; the ranking below
# reads the raw scores directly.
best_features = SelectKBest(score_func=f_classif, k=5)
fit = best_features.fit(X, y)

# Pair each feature name with its score and keep the 11 highest-scoring ones.
feature_scores = pd.DataFrame(
    {'Feature': X.columns, 'Score': fit.scores_}
).nlargest(11, 'Score')

# Display the most important features
print(feature_scores)

                 Feature     Score
3                kidhome  3.623727
30         education_phd  3.580903
7               mntwines  3.410434
0                     id  2.572312
26    education_2n_cycle  2.565282
28  education_graduation  2.268276
12          mntgoldprods  2.133537
1             year_birth  2.033209
19          acceptedcmp4  1.707538
21          acceptedcmp1  1.456048
2                 income  1.342916


  f = msb / msw


In [None]:
# List the encoded column names so the feature subset for the model can be picked from them.
data_encoded.columns

Index(['id', 'year_birth', 'income', 'kidhome', 'teenhome', 'dt_customer',
       'recency', 'mntwines', 'mntfruits', 'mntmeatproducts',
       'mntfishproducts', 'mntsweetproducts', 'mntgoldprods',
       'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases',
       'numstorepurchases', 'numwebvisitsmonth', 'acceptedcmp3',
       'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1', 'acceptedcmp2',
       'complain', 'z_costcontact', 'z_revenue', 'response',
       'education_2n_cycle', 'education_basic', 'education_graduation',
       'education_master', 'education_phd', 'marital_status_absurd',
       'marital_status_alone', 'marital_status_divorced',
       'marital_status_married', 'marital_status_single',
       'marital_status_together', 'marital_status_widow',
       'marital_status_yolo'],
      dtype='object')

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Select the specified columns (eight features plus the 'complain' target)
selected_columns = ['kidhome', 'education_phd', 'education_2n_cycle', 'education_graduation', 'mntwines', 'mntgoldprods', 'year_birth', 'income','complain']
data_selected = data_encoded[selected_columns]

# Split the data into features and target variable
X = data_selected.drop('complain', axis=1)
y = data_selected['complain']

# Split BEFORE scaling so that the held-out rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
# `stratify=y` keeps the class ratio of the target the same in both parts,
# which matters when one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for backward compatibility with code that expects the fully scaled matrix.
X_scaled = scaler.transform(X)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model. Accuracy alone is hard to read on an imbalanced target,
# so also print what always predicting the most frequent class would score.
accuracy = model.score(X_test, y_test)
majority_baseline = y_test.value_counts(normalize=True).max()
print(f"Accuracy: {accuracy}")
print(f"Majority-class baseline accuracy: {majority_baseline}")

Accuracy: 0.9977678571428571


In [None]:
import pickle
# Persist the fitted classifier to disk.
# NOTE: the model was trained on StandardScaler output and the scaler is not
# written to this file, so whoever loads it must apply the same scaling first.
with open('complain_forecast.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Read the classifier back from disk to check that the pickle round-trips.
# pickle.load can execute arbitrary code, so only load files produced by this notebook.
with open('complain_forecast.pkl', 'rb') as model_file:
    model_x = pickle.load(model_file)

In [None]:
import json
import numpy as np
# Two hand-written example customers. The original cell tagged them '#1' and
# '#0' (presumably the prediction the author expected for each — confirm).
json_sample_tagged_1 = '{"kidhome": 1, "education_phd": 1, "education_2n_cycle": 0, "education_graduation": 0, "mntwines": 10, "mntgoldprods": 10, "year_birth": 1984, "income": 40000}' #1
json_sample_tagged_0 = '{"kidhome": 0, "education_phd": 1, "education_2n_cycle": 0, "education_graduation": 0, "mntwines": 100, "mntgoldprods": 100, "year_birth": 1990, "income": 40000}' #0

# As before, only the second sample is scored.
json_sample = json_sample_tagged_0

# Parse into a new name: rebinding `data` here would replace the DataFrame
# that earlier cells read, breaking any re-run of them.
sample_payload = json.loads(json_sample)

# Put the values in the exact column order used for training instead of
# trusting the key order of the JSON document; a missing key raises KeyError.
feature_order = [column for column in selected_columns if column != 'complain']
sample_frame = pd.DataFrame([sample_payload])[feature_order]

# The model was fitted on standardized features, so the raw values must go
# through the same fitted scaler before prediction.
prediction = model_x.predict(scaler.transform(sample_frame))
print(prediction[0])

0


In [None]:
import json
import numpy as np
# Repeat of the single-sample prediction check. The two example customers were
# tagged '#1' and '#0' in the original cell (presumably the prediction the
# author expected for each — confirm).
json_sample_tagged_1 = '{"kidhome": 1, "education_phd": 1, "education_2n_cycle": 0, "education_graduation": 0, "mntwines": 10, "mntgoldprods": 10, "year_birth": 1984, "income": 40000}' #1
json_sample_tagged_0 = '{"kidhome": 0, "education_phd": 1, "education_2n_cycle": 0, "education_graduation": 0, "mntwines": 100, "mntgoldprods": 100, "year_birth": 1990, "income": 40000}' #0

# As before, only the second sample is scored.
json_sample = json_sample_tagged_0

# Parse into a new name so the `data` DataFrame used by earlier cells is not
# overwritten with a dict.
sample_payload = json.loads(json_sample)

# Order the values by the training column list rather than by JSON key order;
# a missing key raises KeyError instead of silently shifting features.
feature_order = [column for column in selected_columns if column != 'complain']
sample_frame = pd.DataFrame([sample_payload])[feature_order]

# Apply the fitted scaler first: the model only ever saw standardized inputs.
prediction = model_x.predict(scaler.transform(sample_frame))
print(prediction[0])

0
