In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor, LocalOutlierFactor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, roc_curve

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the preprocessed telecom-churn CSV from the mounted Drive folder.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/telecom_churn_processed4modeling.csv"
)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,-1,1,0,1,0,0,0,0,1,1,1,29.85,29.85,0
1,1,0,0,0,34,1,0,1,1,0,1,0,0,0,12,0,2,56.95,1889.5,0
2,1,0,0,0,2,1,0,1,1,1,0,0,0,0,1,1,2,53.85,108.15,1
3,1,0,0,0,45,0,-1,1,1,0,1,1,0,0,12,0,3,42.3,1840.75,0
4,0,0,0,0,2,1,0,2,0,0,0,0,0,0,1,1,1,70.7,151.65,1


In [None]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int64  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int64  
 3   Dependents        7043 non-null   int64  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int64  
 6   MultipleLines     7043 non-null   int64  
 7   InternetService   7043 non-null   int64  
 8   OnlineSecurity    7043 non-null   int64  
 9   OnlineBackup      7043 non-null   int64  
 10  DeviceProtection  7043 non-null   int64  
 11  TechSupport       7043 non-null   int64  
 12  StreamingTV       7043 non-null   int64  
 13  StreamingMovies   7043 non-null   int64  
 14  Contract          7043 non-null   int64  
 15  PaperlessBilling  7043 non-null   int64  
 16  PaymentMethod     7043 non-null   int64  


In [None]:
# Convert 'TotalCharges' to a numeric dtype; errors='coerce' turns any value
# that cannot be parsed into NaN. Rows whose 'TotalCharges' ends up NaN are
# then dropped.
# Written as a non-mutating chain (no inplace=True) so the cell produces a
# fresh frame each time it runs and can safely be re-executed.
df = (
    df
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
)

# Guard: the column used as the regression target must contain no missing values.
assert df['TotalCharges'].notna().all(), "TotalCharges still contains NaN"

In [None]:
# Target vector: the TotalCharges column.
y = df["TotalCharges"]

# Feature matrix: every remaining column except the target itself and Churn.
X = df.drop(columns=["TotalCharges", "Churn"])

In [None]:
# Show (rows, columns) of the frame after rows with missing TotalCharges were dropped.
df.shape

(7032, 20)

In [None]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so that
# the split -- and therefore every metric computed from it -- is identical
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV

# One seed shared by every estimator that accepts random_state, so that the
# grid-search ranking does not change between runs.
RANDOM_STATE = 42

# Candidate regressors and the hyperparameter grid searched for each of them.
# Layout: display name -> {'model': unfitted estimator, 'params': grid dict}.
# An empty grid means the estimator is evaluated with its default settings only.
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.1, 0.5, 1.0, 2.0]
        }
    },
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.1, 0.5, 1.0, 2.0]
        }
    },
    'SVR': {
        'model': SVR(),
        'params': {
            'kernel': ['linear', 'poly', 'rbf'],
            'C': [0.1, 1, 10]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [5, 10, 15]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    },
    'AdaBoost': {
        'model': AdaBoostRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    },
    'KNeighbors': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    },
    'XGBoost': {
        'model': XGBRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    },
    'LightGBM': {
        # verbose=-1 silences the per-fit [Info] lines that otherwise flood the
        # output of every cross-validation fold.
        'model': LGBMRegressor(random_state=RANDOM_STATE, verbose=-1),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    }
}

In [None]:
import time

%time
reg_results = []

for model_name, model_info in models.items():
    model = model_info['model']
    params = model_info['params']

    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error')
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    end_time = time.time()
    fit_time = end_time - start_time

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = -grid_search.best_score_
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    reg_results.append({
        'Model': model_name,
        'Best Parameters': best_params,
        'Best Score': f"{best_score:0.2f}",
        'MSE': f"{mse:0.2f}",
        'R2': f"{r2:0.6f}",
        'Fit Time': f"{fit_time:0.2f} sec"
    })

reg_results_df = pd.DataFrame(reg_results).sort_values(by='R2', ascending=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 373
[LightGBM] [Info] Number of data points in the train set: 3937, number of used features: 18
[LightGBM] [Info] Start training from score 2287.749581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 373
[LightGBM] [Info] Number of data points in the train set: 3937, number of used features: 18
[LightGBM] [Info] Start training from score 2274.688583
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force

In [None]:
# Display the per-model comparison table built by the grid-search loop.
reg_results_df

Unnamed: 0,Model,Best Parameters,Best Score,MSE,R2,Fit Time
5,Random Forest,"{'max_depth': 10, 'n_estimators': 300}",6714.28,6151.22,0.998854,128.19 sec
10,LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100}",7018.43,6166.46,0.998852,9.55 sec
9,XGBoost,"{'learning_rate': 0.1, 'n_estimators': 100}",6720.85,6183.73,0.998848,14.86 sec
6,Gradient Boosting,"{'learning_rate': 0.1, 'n_estimators': 300}",8346.82,7302.55,0.99864,52.98 sec
8,KNeighbors,"{'n_neighbors': 7, 'weights': 'distance'}",9031.36,8000.56,0.99851,0.90 sec
4,Decision Tree,"{'max_depth': 10, 'min_samples_split': 10}",14364.44,12632.6,0.997647,1.33 sec
7,AdaBoost,"{'learning_rate': 0.5, 'n_estimators': 200}",100476.56,104585.94,0.980521,44.80 sec
3,SVR,"{'C': 10, 'kernel': 'poly'}",128076.94,132573.94,0.975309,75.47 sec
1,Lasso,{'alpha': 0.1},483150.18,506343.12,0.905696,1.79 sec
2,Ridge,{'alpha': 1.0},483148.59,506353.95,0.905694,0.97 sec


In [None]:
# Retrain LightGBM on the training split with the settings listed for it in the
# comparison table (learning_rate=0.1, n_estimators=100) and predict the test split.
lgbm_model = LGBMRegressor(n_estimators=100, learning_rate=0.1)
lgbm_model.fit(X_train, y_train)
y_pred = lgbm_model.predict(X_test)

# One row per test sample: actual value, prediction and absolute error.
testnpred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
testnpred_df["diff"] = (y_test - y_pred).abs()

# Save the comparison (the row index is not written) and preview ten rows.
testnpred_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/Supervised/Results/LightGBM.csv",
    index=False,
)
testnpred_df.head(10)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 373
[LightGBM] [Info] Number of data points in the train set: 4922, number of used features: 18
[LightGBM] [Info] Start training from score 2284.061469


Unnamed: 0,y_test,y_pred,diff
5974,218.5,192.075285,26.424715
5699,1219.85,1297.540641,77.690641
3369,580.8,648.755542,67.955542
1224,45.85,44.782971,1.067029
3416,837.5,824.927054,12.572946
2448,1531.4,1457.855238,73.544762
2833,4017.45,4061.236783,43.786783
4286,801.3,827.171256,25.871256
5790,2036.55,2105.994163,69.444163
634,2236.2,2232.744059,3.455941


In [None]:
# Retrain the random forest on the training split with the settings listed for
# it in the comparison table (max_depth=10, n_estimators=300) and predict the test split.
randf_model = RandomForestRegressor(n_estimators=300, max_depth=10)
randf_model.fit(X_train, y_train)
y_pred = randf_model.predict(X_test)

# One row per test sample: actual value, prediction and absolute error.
testnpred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
testnpred_df["diff"] = (y_test - y_pred).abs()

# Save the comparison (the row index is not written) and preview ten rows.
testnpred_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/Supervised/Results/RandomForest.csv",
    index=False,
)
testnpred_df.head(10)

Unnamed: 0,y_test,y_pred,diff
5974,218.5,204.008502,14.491498
5699,1219.85,1344.970255,125.120255
3369,580.8,617.020403,36.220403
1224,45.85,45.470189,0.379811
3416,837.5,855.757306,18.257306
2448,1531.4,1333.58894,197.81106
2833,4017.45,4013.691408,3.758592
4286,801.3,808.112454,6.812454
5790,2036.55,2073.219043,36.669043
634,2236.2,2205.489554,30.710446


In [None]:
# Retrain XGBoost on the training split with the settings listed for it in the
# comparison table (learning_rate=0.1, n_estimators=100) and predict the test split.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# One row per test sample: actual value, prediction and absolute error.
testnpred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
testnpred_df["diff"] = (y_test - y_pred).abs()

# Save the comparison (the row index is not written) and preview ten rows.
testnpred_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/Supervised/Results/XGBoost.csv",
    index=False,
)
testnpred_df.head(10)

Unnamed: 0,y_test,y_pred,diff
5974,218.5,204.784912,13.715088
5699,1219.85,1361.877441,142.027441
3369,580.8,641.382507,60.582507
1224,45.85,47.283894,1.433894
3416,837.5,844.118164,6.618164
2448,1531.4,1440.91394,90.48606
2833,4017.45,4064.537354,47.087354
4286,801.3,824.170471,22.870471
5790,2036.55,2076.887207,40.337207
634,2236.2,2287.261963,51.061963


In [None]:
# Retrain gradient boosting on the training split with the settings listed for
# it in the comparison table (learning_rate=0.1, n_estimators=300) and predict the test split.
gradb_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
gradb_model.fit(X_train, y_train)
y_pred = gradb_model.predict(X_test)

# One row per test sample: actual value, prediction and absolute error.
testnpred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
testnpred_df["diff"] = (y_test - y_pred).abs()

# Save the comparison (the row index is not written) and preview ten rows.
testnpred_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/Supervised/Results/GradientBoosting.csv",
    index=False,
)
testnpred_df.head(10)

Unnamed: 0,y_test,y_pred,diff
5974,218.5,217.590921,0.909079
5699,1219.85,1239.539533,19.689533
3369,580.8,637.046947,56.246947
1224,45.85,39.745595,6.104405
3416,837.5,721.811239,115.688761
2448,1531.4,1446.601262,84.798738
2833,4017.45,4093.600031,76.150031
4286,801.3,825.832804,24.532804
5790,2036.55,2025.336141,11.213859
634,2236.2,2158.469972,77.730028


In [None]:
# Retrain k-nearest-neighbours on the training split with the settings listed
# for it in the comparison table (n_neighbors=7, weights='distance') and predict the test split.
knn_model = KNeighborsRegressor(weights='distance', n_neighbors=7)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# One row per test sample: actual value, prediction and absolute error.
testnpred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
testnpred_df["diff"] = (y_test - y_pred).abs()

# Save the comparison (the row index is not written) and preview ten rows.
testnpred_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/Supervised/Results/KNN.csv",
    index=False,
)
testnpred_df.head(10)

Unnamed: 0,y_test,y_pred,diff
5974,218.5,211.693979,6.806021
5699,1219.85,1228.029048,8.179048
3369,580.8,610.309323,29.509323
1224,45.85,45.453132,0.396868
3416,837.5,829.263314,8.236686
2448,1531.4,1440.35898,91.04102
2833,4017.45,3976.546045,40.903955
4286,801.3,802.987869,1.687869
5790,2036.55,2041.424047,4.874047
634,2236.2,2285.357693,49.157693


In [None]:
# Retrain the decision tree on the training split and predict the test split.
# min_samples_split is set to 10 to match the best grid point reported in the
# comparison table ({'max_depth': 10, 'min_samples_split': 10}); the previous
# value of 2 refit a different configuration from the one that was selected.
tree_model = DecisionTreeRegressor(max_depth=10, min_samples_split=10)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

# One row per test sample: actual value, prediction and absolute error.
testnpred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred, "diff": np.abs(y_test - y_pred)})

# Save the comparison (the row index is not written) and preview ten rows.
testnpred_df.to_csv("/content/drive/MyDrive/Colab Notebooks/Internship/Neuronetix/Supervised/Results/DecisionTree.csv", index=False)
testnpred_df.head(10)

Unnamed: 0,y_test,y_pred,diff
5974,218.5,198.31,20.19
5699,1219.85,1386.225,166.375
3369,580.8,619.7,38.9
1224,45.85,45.546154,0.303846
3416,837.5,879.907692,42.407692
2448,1531.4,1275.35,256.05
2833,4017.45,4111.85,94.4
4286,801.3,810.58,9.28
5790,2036.55,2054.8,18.25
634,2236.2,2257.25,21.05
