In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [4]:
df = pd.read_csv('Admission_Prediction.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [5]:
df.drop(columns = ['Serial No.'],inplace=True)

In [6]:
df.isnull().sum()

GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [7]:
for i in df.columns:
    if df[i].isna().sum()>0:
        if df[i].dtypes=='float64':
            df[i].fillna(df[i].median(),inplace=True)
        else:
            df[i].fillna(df[i].mode()[0],inplace=True)

In [8]:
df.isna().sum()

GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [9]:
X=df.drop(columns=['Chance of Admit'])
Y=df['Chance of Admit']

In [10]:
X

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.0,118.0,4.0,4.5,4.5,9.65,1
1,324.0,107.0,4.0,4.0,4.5,8.87,1
2,317.0,104.0,3.0,3.0,3.5,8.00,1
3,322.0,110.0,3.0,3.5,2.5,8.67,1
4,314.0,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.0,108.0,5.0,4.5,4.0,9.02,1
496,337.0,117.0,5.0,5.0,5.0,9.87,1
497,330.0,120.0,5.0,4.5,5.0,9.56,1
498,312.0,103.0,4.0,4.0,5.0,8.43,0


In [11]:
Y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size =0.20,random_state = 120)

In [14]:
lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

In [15]:
lr.intercept_

-1.2843016615461327

In [16]:
lr.coef_

array([0.00189727, 0.00253558, 0.00564356, 0.00240518, 0.01529286,
       0.12168918, 0.02278335])

In [17]:
y_pred = lr.predict(x_test)
y_pred

array([0.72488031, 0.70331147, 0.76051143, 0.64773312, 0.67306757,
       0.68555365, 0.62452874, 0.53913699, 0.84419651, 0.72149045,
       0.81266231, 0.8138742 , 0.65927199, 0.86202552, 0.69979236,
       0.50831945, 0.82491291, 0.56690895, 0.95493591, 0.53503182,
       0.741139  , 0.86189417, 0.88556975, 0.66135794, 0.73715459,
       0.80776472, 0.67493374, 0.87964657, 0.95732097, 0.65614933,
       0.72978089, 0.61189638, 0.77125541, 0.77046635, 0.84880397,
       0.78077769, 0.82278888, 0.62615412, 0.60274305, 0.77402953,
       0.56448484, 0.79093086, 0.8365031 , 0.61105256, 0.70001207,
       0.66414751, 0.57081826, 0.76120218, 0.86184591, 0.94770548,
       0.83224381, 0.74894362, 0.71990594, 0.70644387, 0.52549286,
       0.8322867 , 0.73395195, 0.85372648, 0.68561199, 0.73148983,
       1.00162301, 0.78759002, 0.61284177, 0.92766986, 0.8507034 ,
       0.88666255, 0.84987946, 0.73302878, 0.65453768, 0.72011466,
       0.82601118, 0.64154275, 0.64326627, 0.64789772, 0.54626

In [18]:
mse = mean_squared_error(y_test,y_pred)
mse

0.003312886031514512

In [19]:
mae = mean_absolute_error(y_test,y_pred)
mae

0.037757422710854624

In [20]:
from sklearn.metrics import r2_score

In [21]:
r2 = r2_score(y_test,y_pred)
r2

0.8039353922394976

# Lasso Regularization

In [23]:
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV

In [25]:
lasscv = LassoCV(alphas = None, cv = 10, max_iter=100000,normalize=True)
lasscv.fit(x_train,y_train)

LassoCV(cv=10, max_iter=100000, normalize=True)

In [26]:
alpha = lasscv.alpha_
alpha

1.6866548359236376e-05

In [27]:
lasso_reg = Lasso(alpha)
lasso_reg.fit(x_train,y_train)

Lasso(alpha=1.6866548359236376e-05)

In [34]:
lasso_reg.score(x_test,y_test)

0.8039024496399482

# Ridge

In [35]:
alphas = np.random.uniform(low=0,high=10,size=(50,))

In [36]:
ridgecv = RidgeCV(alphas = alphas,cv=10,normalize=True)
ridgecv.fit(x_train,y_train)

RidgeCV(alphas=array([5.19764593, 9.33913357, 2.95805639, 0.48239888, 6.14802464,
       7.7932212 , 6.21572138, 5.4194097 , 1.78132944, 5.47629115,
       6.27609716, 8.47884866, 1.96225393, 8.82827439, 0.57774525,
       3.98673481, 6.25875213, 5.9242407 , 2.26590786, 8.829969  ,
       6.13854126, 1.14908459, 2.99785294, 3.405095  , 0.68087228,
       0.57214387, 8.62250038, 5.93082148, 2.85967143, 3.04341552,
       6.37144237, 9.11921533, 0.10073943, 8.18463112, 3.20319528,
       9.95151986, 5.65798299, 6.02382743, 2.05296074, 8.14681464,
       1.78019956, 3.90863368, 4.7718852 , 0.11033218, 3.0640578 ,
       7.60058235, 1.02446998, 6.40708089, 0.94956469, 8.57187369]),
        cv=10, normalize=True)

In [37]:
ridgecv.alpha_

0.1007394259746619

In [38]:
ridge_model = Ridge(ridgecv.alpha_)


In [41]:
ridge_model.fit(x_train,y_train)

Ridge(alpha=0.1007394259746619)

In [42]:
ridge_model.score(x_test,y_test)

0.8039302644234766

# Elastic

In [50]:
elasticcv = ElasticNetCV(alphas=None,cv=10)
elasticcv.fit(x_train,y_train)

ElasticNetCV(cv=10)

In [51]:
elasticcv.alpha_

0.002664798000000001

In [53]:
elatic = ElasticNet(elasticcv.alpha_)

In [54]:
elatic.fit(x_train,y_train)

ElasticNet(alpha=0.002664798000000001)

In [56]:
elatic.score(x_test,y_test)

0.7999298448378893

In [58]:
elasticcv.l1_ratio

0.5

In [59]:
elasticnet_reg = ElasticNet(alpha = elasticcv.alpha_,l1_ratio=0.5)
elasticnet_reg.fit(x_train,y_train)

ElasticNet(alpha=0.002664798000000001)

In [60]:
elasticnet_reg.score(x_test,y_test)

0.7999298448378893

In [61]:
data = pd.read_csv('Advertising.csv',index_col=[0])
data.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [62]:
feature_col = ['TV','radio','newspaper']
x=data[feature_col]
y=data.sales

lm = LinearRegression()
lm.fit(x,y)

print(lm.intercept_)
print(lm.coef_)

2.9388893694594067
[ 0.04576465  0.18853002 -0.00103749]


# OLS :- Ordinary least Square method

In [64]:
import statsmodels.formula.api as smf

In [70]:
ols = smf.ols(formula='sales ~ TV + radio +newspaper',data=data).fit()
ols.conf_int()
ols.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Tue, 28 Jun 2022",Prob (F-statistic):,1.58e-96
Time:,17:09:42,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [71]:
ols1 = smf.ols(formula='sales ~ TV+radio',data=data).fit()
ols1.rsquared

0.8971942610828957

In [72]:
ols2 = smf.ols(formula='sales ~ TV+radio+newspaper',data=data).fit()
ols2.rsquared

0.8972106381789522

# Handling Categorical Predictors with Two Categories

In [82]:
nums = np.random.rand(len(data))
mask_large = nums > 0.5

In [83]:
data['Scale'] = 'small'
data.loc[mask_large, 'Scale'] = 'large'
data.head()

Unnamed: 0,TV,radio,newspaper,sales,Scale
1,230.1,37.8,69.2,22.1,large
2,44.5,39.3,45.1,10.4,large
3,17.2,45.9,69.3,9.3,large
4,151.5,41.3,58.5,18.5,large
5,180.8,10.8,58.4,12.9,small


In [84]:
data['islarge']=data.Scale.map({'small':0,'large':1})
data.head()

Unnamed: 0,TV,radio,newspaper,sales,Scale,islarge
1,230.1,37.8,69.2,22.1,large,1
2,44.5,39.3,45.1,10.4,large,1
3,17.2,45.9,69.3,9.3,large,1
4,151.5,41.3,58.5,18.5,large,1
5,180.8,10.8,58.4,12.9,small,0


In [87]:
# create X and y
feature_cols = ['TV', 'radio', 'newspaper', 'islarge']
X = data[feature_cols]
y = data.sales

# instantiate, fit
lm = LinearRegression()
lm.fit(X, y)

# print coefficients
i=0
for col in feature_cols:
    print('The Coefficient of ',col, ' is: ',lm.coef_[i])
    print('the intercept of ',col,lm.intercept_)
    i=i+1

The Coefficient of  TV  is:  0.04574848366433905
the intercept of  TV 2.9552162596171705
The Coefficient of  radio  is:  0.18851194376925454
the intercept of  radio 2.9552162596171705
The Coefficient of  newspaper  is:  -0.0009466760179824221
the intercept of  newspaper 2.9552162596171705
The Coefficient of  islarge  is:  -0.03135536466393638
the intercept of  islarge 2.9552162596171705


# Handling Categorical variables with More than Two Categories

In [88]:
# set a seed for reproducibility
np.random.seed(123456)

# assign roughly one third of observations to each group
nums = np.random.rand(len(data))
mask_suburban = (nums > 0.33) & (nums < 0.66)
mask_urban = nums > 0.66
data['Targeted Geography'] = 'rural'
data.loc[mask_suburban, 'Targeted Geography'] = 'suburban'
data.loc[mask_urban, 'Targeted Geography'] = 'urban'
data.head()

Unnamed: 0,TV,radio,newspaper,sales,Scale,islarge,Targeted Geography
1,230.1,37.8,69.2,22.1,large,1,rural
2,44.5,39.3,45.1,10.4,large,1,urban
3,17.2,45.9,69.3,9.3,large,1,rural
4,151.5,41.3,58.5,18.5,large,1,urban
5,180.8,10.8,58.4,12.9,small,0,suburban


In [89]:
# create three dummy variables using get_dummies, then exclude the first dummy column
area_dummies = pd.get_dummies(data['Targeted Geography'], prefix='Targeted Geography').iloc[:,1:]

# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
data = pd.concat([data, area_dummies], axis=1)
data.head()

Unnamed: 0,TV,radio,newspaper,sales,Scale,islarge,Targeted Geography,Targeted Geography_suburban,Targeted Geography_urban
1,230.1,37.8,69.2,22.1,large,1,rural,0,0
2,44.5,39.3,45.1,10.4,large,1,urban,0,1
3,17.2,45.9,69.3,9.3,large,1,rural,0,0
4,151.5,41.3,58.5,18.5,large,1,urban,0,1
5,180.8,10.8,58.4,12.9,small,0,suburban,1,0


In [91]:
# create X and y
feature_cols = ['TV', 'radio', 'newspaper', 'islarge', 'Targeted Geography_suburban', 'Targeted Geography_urban']
X = data[feature_cols]
y = data.sales

# instantiate, fit
lm = LinearRegression()
lm.fit(X, y)

# print coefficients
print(feature_cols, lm.coef_)

['TV', 'radio', 'newspaper', 'islarge', 'Targeted Geography_suburban', 'Targeted Geography_urban'] [ 0.04579306  0.18760948 -0.00096888 -0.01742426 -0.1189193   0.25177098]


# Polynomial Regression

not use much but we have to know what is polynomial regression