In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statsmodels.api 
import seaborn as sns
import matplotlib.pyplot as plt
#from pandas_profiling import ProfileReport
from statsmodels.formula.api import ols
from scipy import stats
from scipy.stats import norm
import statsmodels.api as sm
#from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score,StratifiedKFold,learning_curve
from sklearn.metrics import classification_report,f1_score, recall_score, precision_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
%matplotlib inline
import os
os.chdir(r'C:\Users\manee\OneDrive\GMAT Documents\Fall 2022 Semester\Predictive Analytics 6337\project')

In [27]:
# --- Functions ---


# Plot learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Validation score")

    plt.legend(loc="best")
    return plt



In [28]:
df = pd.read_csv('telecom_customer_churn.csv')

In [29]:
df = df.drop(['Customer ID','Zip Code','Latitude','Longitude','City', 'Churn Category', 'Churn Reason'],axis=1)
df['Customer Status'] = np.where(df['Customer Status'] == "Churned", 1,0)

In [16]:
df

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Unnamed: 38,Unnamed: 39
0,Female,37,Yes,0,2,9,,Yes,42.39,No,...,Credit Card,65.60,593.30,0.00,0,381.51,974.81,0,,
1,Male,46,No,0,0,9,,Yes,10.69,Yes,...,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,0,total cells,267634.0
2,Male,50,No,0,0,4,Offer E,Yes,33.65,No,...,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,1,missing value cells,
3,Male,78,Yes,0,1,13,Offer D,Yes,27.82,No,...,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,1,,
4,Female,75,Yes,0,3,3,,Yes,7.38,No,...,Credit Card,83.90,267.40,0.00,0,22.14,289.54,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,20,No,0,0,13,Offer D,Yes,46.68,No,...,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,0,,
7039,Male,40,Yes,0,1,22,Offer D,Yes,16.20,Yes,...,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,1,,
7040,Male,22,No,0,0,2,Offer E,Yes,18.62,No,...,Credit Card,50.30,92.75,0.00,0,37.24,129.99,0,,
7041,Male,21,Yes,0,5,67,Offer A,Yes,2.12,No,...,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,0,,


In [30]:
# --- Replacing Nan values according to the EDA ---

df[['Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 
           'Premium Tech Support', 'Streaming TV', 'Streaming Movies','Streaming Music','Unlimited Data']] = df[['Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 
           'Premium Tech Support', 'Streaming TV', 'Streaming Movies','Streaming Music','Unlimited Data']].replace(np.nan,'No')
df['Avg Monthly GB Download'] = df['Avg Monthly GB Download'].replace(np.nan,0.0)
df['Multiple Lines'] = df['Multiple Lines'].replace(np.nan,'No')
df['Avg Monthly Long Distance Charges'] = df['Avg Monthly Long Distance Charges'].replace(np.nan,0.0)

In [21]:
df = df[df['Monthly Charge'] >= 0]

In [31]:
# --- Data Set splitting ---
X = df.drop('Customer Status',axis=1)
y = df['Customer Status']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=321) #scikit learn

In [32]:
pd.Series({c: df.select_dtypes('object')[c].unique() for c in df.select_dtypes('object')})

Gender                                                       [Female, Male]
Married                                                           [Yes, No]
Offer                     [None, Offer E, Offer D, Offer A, Offer B, Off...
Phone Service                                                     [Yes, No]
Multiple Lines                                                    [No, Yes]
Internet Service                                                  [Yes, No]
Internet Type                                 [Cable, Fiber Optic, DSL, No]
Online Security                                                   [No, Yes]
Online Backup                                                     [Yes, No]
Device Protection Plan                                            [No, Yes]
Premium Tech Support                                              [Yes, No]
Streaming TV                                                      [Yes, No]
Streaming Movies                                                  [No, Yes]
Streaming Mu

In [33]:
#--- Label encoding the Ordinal Internet Type Variable ---
ord_encoder = LabelEncoder()
X_train['Internet Type'] = ord_encoder.fit_transform(X_train['Internet Type'])
X_test['Internet Type'] = ord_encoder.transform(X_test['Internet Type'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Internet Type'] = ord_encoder.fit_transform(X_train['Internet Type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Internet Type'] = ord_encoder.transform(X_test['Internet Type'])


In [10]:
#--- Standardizing numeric columns ---
num_col = X_train.select_dtypes('number').columns
std_scaler = StandardScaler()
X_train[num_col] = std_scaler.fit_transform(X_train[num_col])
X_test[num_col] = std_scaler.transform(X_test[num_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [11]:
#--- one hot encoding non ordinal features ---
oh_col = X.select_dtypes('object').drop('Internet Type',axis=1).columns #object columns to be one hot encoded 

oh_encoder = OneHotEncoder(sparse=False)
OH_encoded_train_features = oh_encoder.fit_transform(X_train[oh_col])
OH_encoded_X_train = pd.DataFrame(data=OH_encoded_train_features,index=X_train[oh_col].index,columns=oh_encoder.get_feature_names(X_train[oh_col].columns))

OH_encoded_test_features = oh_encoder.transform(X_test[oh_col])
OH_encoded_X_test = pd.DataFrame(data=OH_encoded_test_features,index=X_test[oh_col].index,columns=oh_encoder.get_feature_names(X_test[oh_col].columns))

In [12]:
X_train = pd.concat([X_train,OH_encoded_X_train],axis=1)
X_train = X_train.drop(X_train[oh_col],axis=1)
X_test = pd.concat([X_test,OH_encoded_X_test],axis=1)
X_test = X_test.drop(X_test[oh_col],axis=1)

In [13]:
X_train['Internet Type']

4044    1.347971
1008    0.265293
3954    0.265293
1696    0.265293
2022    0.265293
          ...   
6391   -1.900062
399     1.347971
1844   -0.817384
2996   -0.817384
5866    0.265293
Name: Internet Type, Length: 5538, dtype: float64

In [14]:
X_train = X_train.drop(['Gender_Female', 'Married_No', 'Offer_None', 'Phone Service_No', 'Multiple Lines_No', 
              'Internet Service_No','Online Security_No', 'Online Backup_No',
              'Device Protection Plan_No', 'Premium Tech Support_No', 'Streaming TV_No',
             'Streaming Movies_No','Streaming Music_No','Unlimited Data_No', 
              'Contract_Month-to-Month', 'Paperless Billing_No','Payment Method_Bank Withdrawal'],axis=1)

In [15]:
X_test = X_test.drop(['Gender_Female', 'Married_No', 'Offer_None', 'Phone Service_No', 'Multiple Lines_No', 
              'Internet Service_No','Online Security_No', 'Online Backup_No',
              'Device Protection Plan_No', 'Premium Tech Support_No', 'Streaming TV_No',
             'Streaming Movies_No','Streaming Music_No','Unlimited Data_No', 
              'Contract_Month-to-Month', 'Paperless Billing_No','Payment Method_Bank Withdrawal'],axis=1)

In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5538 entries, 4044 to 5866
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                5538 non-null   float64
 1   Number of Dependents               5538 non-null   float64
 2   Number of Referrals                5538 non-null   float64
 3   Tenure in Months                   5538 non-null   float64
 4   Avg Monthly Long Distance Charges  5538 non-null   float64
 5   Internet Type                      5538 non-null   float64
 6   Avg Monthly GB Download            5538 non-null   float64
 7   Monthly Charge                     5538 non-null   float64
 8   Total Charges                      5538 non-null   float64
 9   Total Refunds                      5538 non-null   float64
 10  Total Extra Data Charges           5538 non-null   float64
 11  Total Long Distance Charges        5538 non-null   fl

In [17]:
X_train

Unnamed: 0,Age,Number of Dependents,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Internet Type,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Refunds,...,Premium Tech Support_Yes,Streaming TV_Yes,Streaming Movies_Yes,Streaming Music_Yes,Unlimited Data_Yes,Contract_One Year,Contract_Two Year,Paperless Billing_Yes,Payment Method_Credit Card,Payment Method_Mailed Check
4044,1.936083,-0.489690,-0.651232,1.567675,-1.175492,1.347971,-0.999584,-1.317644,-0.223864,-0.250568,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1008,0.864035,-0.489690,0.348544,0.792862,-0.320485,0.265293,-0.853534,0.897381,1.039849,-0.250568,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3954,1.400059,-0.489690,-0.317974,-1.205341,0.600242,0.265293,0.314869,0.996635,-0.880754,-0.250568,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1696,1.757408,-0.489690,-0.317974,0.303506,-1.297912,0.265293,-0.853534,1.236500,0.800671,-0.250568,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2022,-0.624920,-0.489690,-0.651232,-1.164562,0.070615,0.265293,-0.171966,0.402764,-0.856880,-0.250568,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6391,-0.446246,1.576522,1.015061,1.282217,0.238781,-1.900062,-0.171966,0.043794,0.792588,3.217249,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
399,-0.684478,2.609628,-0.317974,-0.797545,1.108607,1.347971,-0.999584,-1.476450,-0.892923,-0.250568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1844,-1.458735,-0.489690,-0.651232,-1.083002,-1.277938,-0.817384,0.996437,-0.290362,-0.855975,-0.250568,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2996,1.757408,2.609628,0.015285,-0.185850,-1.481542,-0.817384,0.022768,-0.839569,-0.519866,-0.250568,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


In [18]:
x = sm.add_constant(X_train)
x

  x = pd.concat(x[::order], 1)


Unnamed: 0,const,Age,Number of Dependents,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Internet Type,Avg Monthly GB Download,Monthly Charge,Total Charges,...,Premium Tech Support_Yes,Streaming TV_Yes,Streaming Movies_Yes,Streaming Music_Yes,Unlimited Data_Yes,Contract_One Year,Contract_Two Year,Paperless Billing_Yes,Payment Method_Credit Card,Payment Method_Mailed Check
4044,1.0,1.936083,-0.489690,-0.651232,1.567675,-1.175492,1.347971,-0.999584,-1.317644,-0.223864,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1008,1.0,0.864035,-0.489690,0.348544,0.792862,-0.320485,0.265293,-0.853534,0.897381,1.039849,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3954,1.0,1.400059,-0.489690,-0.317974,-1.205341,0.600242,0.265293,0.314869,0.996635,-0.880754,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1696,1.0,1.757408,-0.489690,-0.317974,0.303506,-1.297912,0.265293,-0.853534,1.236500,0.800671,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2022,1.0,-0.624920,-0.489690,-0.651232,-1.164562,0.070615,0.265293,-0.171966,0.402764,-0.856880,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6391,1.0,-0.446246,1.576522,1.015061,1.282217,0.238781,-1.900062,-0.171966,0.043794,0.792588,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
399,1.0,-0.684478,2.609628,-0.317974,-0.797545,1.108607,1.347971,-0.999584,-1.476450,-0.892923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1844,1.0,-1.458735,-0.489690,-0.651232,-1.083002,-1.277938,-0.817384,0.996437,-0.290362,-0.855975,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2996,1.0,1.757408,2.609628,0.015285,-0.185850,-1.481542,-0.817384,0.022768,-0.839569,-0.519866,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


In [19]:
x = sm.add_constant(X_train)

#fit regression model
model = sm.OLS(y_train, x).fit()
print(model.summary())
#view summary of model fit

                            OLS Regression Results                            
Dep. Variable:        Customer Status   R-squared:                       0.365
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     90.41
Date:                Tue, 29 Nov 2022   Prob (F-statistic):               0.00
Time:                        03:01:19   Log-Likelihood:                -2074.3
No. Observations:                5538   AIC:                             4221.
Df Residuals:                    5502   BIC:                             4459.
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

  x = pd.concat(x[::order], 1)


In [20]:
y_pred = model.predict(x)
y_pred

4044    0.150852
1008    0.159139
3954    0.935857
1696    0.632745
2022    0.551812
          ...   
6391   -0.022627
399     0.110931
1844    0.302168
2996    0.215535
5866    0.642357
Length: 5538, dtype: float64

In [21]:
md = model.summary()
md

0,1,2,3
Dep. Variable:,Customer Status,R-squared:,0.365
Model:,OLS,Adj. R-squared:,0.361
Method:,Least Squares,F-statistic:,90.41
Date:,"Tue, 29 Nov 2022",Prob (F-statistic):,0.0
Time:,03:01:20,Log-Likelihood:,-2074.3
No. Observations:,5538,AIC:,4221.0
Df Residuals:,5502,BIC:,4459.0
Df Model:,35,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8401,0.081,10.426,0.000,0.682,0.998
Age,0.0333,0.006,5.481,0.000,0.021,0.045
Number of Dependents,-0.0465,0.005,-8.773,0.000,-0.057,-0.036
Number of Referrals,-0.0920,0.007,-14.023,0.000,-0.105,-0.079
Tenure in Months,-2.422e-05,0.014,-0.002,0.999,-0.028,0.028
Avg Monthly Long Distance Charges,-0.0204,0.008,-2.401,0.016,-0.037,-0.004
Internet Type,-0.0588,0.013,-4.394,0.000,-0.085,-0.033
Avg Monthly GB Download,-0.0015,0.007,-0.227,0.821,-0.015,0.012
Monthly Charge,0.3147,0.028,11.237,0.000,0.260,0.370

0,1,2,3
Omnibus:,184.237,Durbin-Watson:,1.967
Prob(Omnibus):,0.0,Jarque-Bera (JB):,176.547
Skew:,0.395,Prob(JB):,4.61e-39
Kurtosis:,2.623,Cond. No.,2250000000000000.0


In [22]:
params = model.params
conf = model.conf_int()
conf['Odds Ratio'] = params
conf.columns = ['5%', '95%', 'Odds Ratio']
db = np.exp(conf)


### Linear regression 

In [23]:
# --- Data Set splitting ---
X= df.drop('Total Revenue',axis=1)
y = df['Total Revenue']
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size =0.2)
# print the data
X_train

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Customer Status
1281,Female,23,Yes,3,0,24,Offer C,Yes,46.54,Yes,...,Yes,Two Year,No,Bank Withdrawal,64.35,1558.65,0.00,0,1116.96,0
2664,Male,40,No,0,0,1,,Yes,16.91,No,...,No,Month-to-Month,No,Credit Card,19.65,19.65,0.00,0,16.91,0
4633,Female,34,No,0,0,16,,No,0.00,No,...,Yes,Month-to-Month,Yes,Bank Withdrawal,38.90,664.40,0.00,0,0.00,0
1404,Male,46,Yes,2,2,55,,Yes,24.70,No,...,No,Two Year,No,Bank Withdrawal,19.35,1153.25,0.00,0,1358.50,0
5091,Male,48,No,0,0,1,Offer E,Yes,27.58,No,...,Yes,Month-to-Month,No,Bank Withdrawal,69.95,69.95,0.00,0,27.58,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5466,Male,57,Yes,1,1,5,,Yes,11.54,Yes,...,Yes,Month-to-Month,Yes,Bank Withdrawal,80.15,385.00,0.00,0,57.70,1
1722,Female,38,Yes,3,5,35,,Yes,18.86,Yes,...,No,Two Year,No,Bank Withdrawal,24.30,821.60,0.00,0,660.10,0
4800,Male,64,Yes,0,2,35,,Yes,23.85,Yes,...,No,Month-to-Month,Yes,Credit Card,91.50,3236.35,40.79,90,834.75,0
1944,Female,25,No,1,0,49,,Yes,45.64,No,...,No,One Year,No,Credit Card,19.45,921.30,0.00,0,2236.36,0


In [24]:
X_train['Gender'] = np.where(X_train['Gender'] == "Male", 1,0)
X_train['Married'] = np.where(X_train['Married'] == "Yes", 1,0)
X_train['Offer A'] = np.where(X_train['Offer'] == "Offer A", 1,0)
X_train['Offer B'] = np.where(X_train['Offer'] == "Offer B", 1,0)
X_train['Offer C'] = np.where(X_train['Offer'] == "Offer C", 1,0)
X_train['Offer E'] = np.where(X_train['Offer'] == "Offer E", 1,0)
X_train['Offer D'] = np.where(X_train['Offer'] == "Offer D", 1,0)
X_train['Multiple Lines'] = np.where(X_train['Multiple Lines'] == "Yes", 1,0)
X_train['Cable'] = np.where(X_train['Internet Type'] == "Cable", 1,0)
X_train['Fiber'] = np.where(X_train['Internet Type'] == "Fiber", 1,0)
X_train['Optic'] = np.where(X_train['Internet Type'] == "Optic", 1,0)
X_train['DSL'] = np.where(X_train['Internet Type'] == "DSL", 1,0)
X_train['Online Security'] = np.where(X_train['Online Security'] == "Yes", 1,0)
X_train['Online Backup'] = np.where(X_train['Online Backup'] == "Yes", 1,0)
X_train['Device Protection Plan'] = np.where(X_train['Device Protection Plan'] == "Yes", 1,0)
X_train['Premium Tech Support'] = np.where(X_train['Premium Tech Support'] == "Yes", 1,0)
X_train['Streaming TV'] = np.where(X_train['Streaming TV'] == "Yes", 1,0)
X_train['Streaming Movies'] = np.where(X_train['Streaming Movies'] == "Yes", 1,0)
X_train['Streaming Music'] = np.where(X_train['Streaming Music'] == "Yes", 1,0)
X_train['Streaming Movies'] = np.where(X_train['Streaming Movies'] == "Yes", 1,0)
X_train['Unlimited Data'] = np.where(X_train['Unlimited Data'] == "Yes", 1,0)
X_train['One Year'] = np.where(X_train['Contract'] == "One Year", 1,0)
X_train['Two Year'] = np.where(X_train['Contract'] == "Two Year", 1,0)
X_train['Month-to-Month'] = np.where(X_train['Contract'] == "Month-to-Month", 1,0)
X_train['Paperless Billing'] = np.where(X_train['Paperless Billing'] == "Yes", 1,0)
X_train['Credit Card'] = np.where(X_train['Payment Method'] == "Credit Card", 1,0)
X_train['Bank Withdrawal'] = np.where(X_train['Payment Method'] == "Bank Withdrawal", 1,0)
X_train['Phone Service'] = np.where(X_train['Phone Service'] == "Yes", 1,0)
X_train['Internet Service'] = np.where(X_train['Phone Service'] == "Yes", 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Gender'] = np.where(X_train['Gender'] == "Male", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Married'] = np.where(X_train['Married'] == "Yes", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Offer A'] = np.where(X_train['Offer'] == "Offer A", 1,0)
A value is t

In [25]:
pd.Series({c: X_train.select_dtypes('object')[c].unique() for c in X_train.select_dtypes('object')})

Offer             [Offer C, None, Offer E, Offer D, Offer B, Off...
Internet Type                         [DSL, No, Fiber Optic, Cable]
Contract                       [Two Year, Month-to-Month, One Year]
Payment Method         [Bank Withdrawal, Credit Card, Mailed Check]
dtype: object

In [26]:
X_train = X_train.drop(['Offer','Internet Type','Contract','Payment Method', 'Total Refunds', 'Total Charges',
                        'Total Extra Data Charges','Total Long Distance Charges'],axis=1)

In [27]:
X_train

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,...,Offer D,Cable,Fiber,Optic,DSL,One Year,Two Year,Month-to-Month,Credit Card,Bank Withdrawal
1281,0,23,1,3,0,24,1,46.54,1,0,...,0,0,0,0,1,0,1,0,0,1
2664,1,40,0,0,0,1,1,16.91,0,0,...,0,0,0,0,0,0,0,1,1,0
4633,0,34,0,0,0,16,0,0.00,0,0,...,0,0,0,0,1,0,0,1,0,1
1404,1,46,1,2,2,55,1,24.70,0,0,...,0,0,0,0,0,0,1,0,0,1
5091,1,48,0,0,0,1,1,27.58,0,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5466,1,57,1,1,1,5,1,11.54,1,0,...,0,0,0,0,0,0,0,1,0,1
1722,0,38,1,3,5,35,1,18.86,1,0,...,0,0,0,0,0,0,1,0,0,1
4800,1,64,1,0,2,35,1,23.85,1,0,...,0,0,0,0,0,0,0,1,1,0
1944,0,25,0,1,0,49,1,45.64,0,0,...,0,0,0,0,0,1,0,0,1,0


In [28]:
xvar2 = sm.add_constant(X_train)
linearmoderl1 = sm.OLS(y_train, xvar2).fit()
print(linearmoderl1.summary())

                            OLS Regression Results                            
Dep. Variable:          Total Revenue   R-squared:                       0.917
Model:                            OLS   Adj. R-squared:                  0.916
Method:                 Least Squares   F-statistic:                     1952.
Date:                Tue, 29 Nov 2022   Prob (F-statistic):               0.00
Time:                        03:01:20   Log-Likelihood:                -45056.
No. Observations:                5538   AIC:                         9.018e+04
Df Residuals:                    5506   BIC:                         9.039e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

  x = pd.concat(x[::order], 1)


In [29]:
summ = linearmoderl1.summary()
summ

0,1,2,3
Dep. Variable:,Total Revenue,R-squared:,0.917
Model:,OLS,Adj. R-squared:,0.916
Method:,Least Squares,F-statistic:,1952.0
Date:,"Tue, 29 Nov 2022",Prob (F-statistic):,0.0
Time:,03:01:20,Log-Likelihood:,-45056.0
No. Observations:,5538,AIC:,90180.0
Df Residuals:,5506,BIC:,90390.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1835.4859,60.804,-30.187,0.000,-1954.685,-1716.287
Gender,13.9357,22.326,0.624,0.533,-29.831,57.703
Age,1.2384,0.838,1.478,0.139,-0.404,2.881
Married,70.7013,32.263,2.191,0.028,7.454,133.949
Number of Dependents,-1.5700,12.986,-0.121,0.904,-27.029,23.889
Number of Referrals,-9.7823,5.215,-1.876,0.061,-20.007,0.442
Tenure in Months,82.3960,0.839,98.235,0.000,80.752,84.040
Phone Service,34.9241,53.046,0.658,0.510,-69.066,138.915
Avg Monthly Long Distance Charges,31.8370,0.828,38.472,0.000,30.215,33.459

0,1,2,3
Omnibus:,1.989,Durbin-Watson:,2.048
Prob(Omnibus):,0.37,Jarque-Bera (JB):,1.973
Skew:,0.024,Prob(JB):,0.373
Kurtosis:,3.079,Cond. No.,5.84e+16
