In [1]:
import pandas as pd
import numpy as np

### Data preparation

In [3]:
data = pd.read_csv(r'data.csv')
data.head()
#data.describe()

Unnamed: 0,age,occupation,marital_status,education_level,airtime_balance,physical_equity,call_duration,joined
0,31.0,blue-collar,single,secondary,-8.06,yes,3,no
1,58.0,technician,single,secondary,86.0,yes,1,no
2,50.0,clerk,married,secondary,5.12,yes,1,no
3,59.0,retired,married,secondary,1.84,no,3,no
4,46.0,management,married,,8.76,yes,0,no


### 

Recoding the variables

In [4]:
#Occupation
occupation_codes = [1,2,3,4,5,6,7,8,9,10]
occupation_categories = ["unemployed", "clerk", "management", "housemaid", "student", 
                         "blue-collar", "self-employed", "retired", "technician", "services"]


#Marital
marital_codes = [1,2,3]
marital_categories = ["single", "married", "divorced"]

#Education
education_codes = [1,2,3,4]
education_categories = ["primary", "secondary", "university", "unknown"]

#yes/no questions
y_n_code = [0,1] 
y_n_name = ["no","yes"]

data['education_level1'] = data['education_level'].replace(education_categories, education_codes)
data['occupation1'] = data['occupation'].replace(occupation_categories, occupation_codes)
data['marital_status1'] = data['marital_status'].replace(marital_categories, marital_codes)
data['physical_equity1'] = data['physical_equity'].replace(y_n_name, y_n_code)
data['joined1'] = data['joined'].replace(y_n_name, y_n_code)

#Create a dataframe to work with - test data
data = data[['age','occupation1','marital_status1','education_level1','airtime_balance',
               'physical_equity1','call_duration','joined1']]
data.head(2)

Unnamed: 0,age,occupation1,marital_status1,education_level1,airtime_balance,physical_equity1,call_duration,joined1
0,31.0,6.0,1.0,2.0,-8.06,1,3,0
1,58.0,9.0,1.0,2.0,86.0,1,1,0


### 

Dealing with missing data

In [5]:
#Dealing with missing data -- Using Imputation (Mean as a strategy)
from sklearn.impute import SimpleImputer
most_frequent = data.copy()
#setting strategy to 'mean' to impute by the mean
mean_imputer = SimpleImputer(strategy='mean')# strategy can also be mean or median 
most_frequent.iloc[:,:] = mean_imputer.fit_transform(most_frequent)
data = most_frequent.iloc[:,:]
data.isnull().sum()

age                 0
occupation1         0
marital_status1     0
education_level1    0
airtime_balance     0
physical_equity1    0
call_duration       0
joined1             0
dtype: int64

### Using the scikit-learn package

In [17]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

#Splitting data
X = data[['age','occupation1','marital_status1','education_level1','physical_equity1','airtime_balance','joined1']]
y = data[['call_duration']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 
#Random state number can be any number. It is used to ensure that the subsets of data remain the same in case we need to use them again (e.g., when comparing different algorithms) 

#Running the linear regression
from sklearn.linear_model import LinearRegression

# Train the model using the training sets
model = LinearRegression().fit(X_train, y_train) #Notice we only use the training data at this stage

print(model.score(X_test, y_test))

0.14533493589579993


### Using the statsmodel package 

In [2]:
import statsmodels.api as sm