<a href="https://colab.research.google.com/github/manoharpavuluri/Basic_Linear_Logistic_Regression/blob/main/Basic_Linear_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *Linear Regression with statsmodels*

In [1]:
import pandas as pd
import numpy as np
import matplotlib as matlab
import statsmodels

# **Data**

In [2]:
air = pd.read_csv("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/AirPassengers/AirPassengers.csv")
air.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Week_num                      80 non-null     int64  
 1   Passengers                    80 non-null     int64  
 2   Promotion_Budget              80 non-null     int64  
 3   Service_Quality_Score         80 non-null     float64
 4   Holiday_week                  80 non-null     object 
 5   Delayed_Cancelled_flight_ind  80 non-null     object 
 6   Inter_metro_flight_ratio      80 non-null     float64
 7   Bad_Weather_Ind               80 non-null     object 
 8   Technical_issues_ind          80 non-null     object 
dtypes: float64(2), int64(3), object(4)
memory usage: 5.8+ KB


In [3]:
air.describe()

Unnamed: 0,Week_num,Passengers,Promotion_Budget,Service_Quality_Score,Inter_metro_flight_ratio
count,80.0,80.0,80.0,80.0,80.0
mean,40.5,44745.025,625422.3,2.781168,0.693
std,23.2379,11267.155215,156514.3,1.000204,0.158045
min,1.0,28700.0,365680.0,0.85536,0.38
25%,20.75,36424.0,517891.5,1.93158,0.61
50%,40.5,43144.0,597919.0,3.10812,0.72
75%,60.25,48864.0,688593.5,3.608882,0.8025
max,80.0,81228.0,1108254.0,4.0,0.9


# **Find Relationship**

In [4]:
# Correlation between Passengers & Promotion_Budget
np.corrcoef(air.Passengers,air.Promotion_Budget)

array([[1.        , 0.96585103],
       [0.96585103, 1.        ]])

# **Finding the slope/coeff and Constant**

# **Single Regression model Coeff and intercept**

In [5]:
import statsmodels.formula.api as sm


In [6]:
model = sm.ols(formula='Passengers ~ Promotion_Budget', data= air)
fitted1 = model.fit()
fitted1.summary()

0,1,2,3
Dep. Variable:,Passengers,R-squared:,0.933
Model:,OLS,Adj. R-squared:,0.932
Method:,Least Squares,F-statistic:,1084.0
Date:,"Tue, 29 Oct 2024",Prob (F-statistic):,1.66e-47
Time:,19:10:05,Log-Likelihood:,-751.34
No. Observations:,80,AIC:,1507.0
Df Residuals:,78,BIC:,1511.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1259.6058,1361.071,0.925,0.358,-1450.078,3969.290
Promotion_Budget,0.0695,0.002,32.923,0.000,0.065,0.074

0,1,2,3
Omnibus:,26.624,Durbin-Watson:,1.831
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5.188
Skew:,-0.128,Prob(JB):,0.0747
Kurtosis:,1.779,Cond. No.,2670000.0


In [7]:
# above you can see constant/intercept (c) is 1259.60 while the slope(m) is 0.0695

# *Linear Regression with sklearn*



In [8]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(air[["Passengers"]],air[["Promotion_Budget"]])

In [9]:
print("coef: ", lr.coef_)
print("slope: ", lr.intercept_)

coef:  [[13.41683347]]
slope:  [25085.80076924]


# **Multiple Regression model Coeff and intercept**

In [10]:
model = sm.ols(formula='Passengers ~ Promotion_Budget + Inter_metro_flight_ratio + Service_Quality_Score', data= air)
fitted1 = model.fit()
fitted1.summary()

0,1,2,3
Dep. Variable:,Passengers,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.949
Method:,Least Squares,F-statistic:,495.6
Date:,"Tue, 29 Oct 2024",Prob (F-statistic):,8.710000000000001e-50
Time:,19:10:08,Log-Likelihood:,-738.45
No. Observations:,80,AIC:,1485.0
Df Residuals:,76,BIC:,1494.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.921e+04,3542.694,5.424,0.000,1.22e+04,2.63e+04
Promotion_Budget,0.0555,0.004,15.476,0.000,0.048,0.063
Inter_metro_flight_ratio,-2003.4508,2129.095,-0.941,0.350,-6243.912,2237.010
Service_Quality_Score,-2802.0708,530.382,-5.283,0.000,-3858.419,-1745.723

0,1,2,3
Omnibus:,6.902,Durbin-Watson:,2.312
Prob(Omnibus):,0.032,Jarque-Bera (JB):,2.759
Skew:,-0.051,Prob(JB):,0.252
Kurtosis:,2.096,Cond. No.,8220000.0


# *Logistic Regression*

In [11]:
from sklearn.linear_model import  LogisticRegression
l = LogisticRegression()
l

In [12]:
sales=pd.read_csv("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Product%20Sales%20Data/Product_sales.csv")


In [13]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467 entries, 0 to 466
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Age     467 non-null    int64
 1   Bought  467 non-null    int64
dtypes: int64(2)
memory usage: 7.4 KB


In [14]:
sales.describe()

Unnamed: 0,Age,Bought
count,467.0,467.0
mean,29.098501,0.438972
std,21.769248,0.496794
min,1.0,0.0
25%,10.0,0.0
50%,18.0,0.0
75%,51.0,1.0
max,65.0,1.0


In [15]:
l.fit(sales[["Age"]],sales[["Bought"]])
print("coef: ", l.coef_)
print("slope: ", l.intercept_)

coef:  [[0.21777169]]
slope:  [-6.90722429]


  y = column_or_1d(y, warn=True)


In [16]:
d1= pd.DataFrame({"Age": [4]})
predictLR = l.predict(d1)
predictLR



array([0])

In [17]:

d2= pd.DataFrame({"Age": [1054]})
predictLR = l.predict(d2)
predictLR

array([1])

# *Logistic Regression the multiple features*

In [18]:
fiber=pd.read_csv("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Fiberbits/Fiberbits_v1.csv")

In [19]:
fiber.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   active_cust                 100000 non-null  int64
 1   income                      100000 non-null  int64
 2   months_on_network           100000 non-null  int64
 3   Num_complaints              100000 non-null  int64
 4   number_plan_changes         100000 non-null  int64
 5   relocated                   100000 non-null  int64
 6   monthly_bill                100000 non-null  int64
 7   technical_issues_per_month  100000 non-null  int64
 8   Speed_test_result           100000 non-null  int64
dtypes: int64(9)
memory usage: 6.9 MB


In [20]:
lrm = LogisticRegression()
lrm.fit(fiber[["income"] + ["months_on_network"] + ["Num_complaints"] + ["number_plan_changes"] + ["relocated"] + ["monthly_bill"] + ["technical_issues_per_month"] + ["Speed_test_result"]], fiber[["active_cust"]])

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import  confusion_matrix


# **Applying Model on Entire Data**

---



In [27]:
predict1=lrm.predict(fiber[["income"]+['months_on_network']+['Num_complaints']+['number_plan_changes']+['relocated']+['monthly_bill']+['technical_issues_per_month']+['Speed_test_result']])
predict1

array([1, 1, 0, ..., 1, 1, 1])

In [28]:
lrm_cm1 = confusion_matrix(fiber[["active_cust"]], predict1)
print(lrm_cm1)

[[33872  8269]
 [ 6168 51691]]


In [29]:
##### from confusion matrix calculate accuracy
print(  sum(sum(lrm_cm1)) )

100000


In [30]:
accuracy1 = (lrm_cm1[0,0]+lrm_cm1[1,1]) / sum(sum(lrm_cm1))
accuracy1

0.85563

# **Applying Model on SubSet of Data using Test_Train_Split**

In [37]:
train_data,test_data = train_test_split(fiber, test_size=0.2, shuffle=True)

In [38]:
print("fiber_data ", fiber.shape)
print("train_data ", train_data.shape)
print("test_data ", test_data.shape)

fiber_data  (100000, 9)
train_data  (80000, 9)
test_data  (20000, 9)


In [39]:
lrm.fit(train_data[["income"] + ["months_on_network"] + ["Num_complaints"] + ["number_plan_changes"] + ["relocated"] + ["monthly_bill"] + ["technical_issues_per_month"] + ["Speed_test_result"]], train_data[["active_cust"]])
predict2=lrm.predict(test_data[["income"]+['months_on_network']+['Num_complaints']+['number_plan_changes']+['relocated']+['monthly_bill']+['technical_issues_per_month']+['Speed_test_result']])
predict2

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([1, 1, 1, ..., 1, 1, 0])

In [40]:
lrm_cm2 = confusion_matrix(test_data[["active_cust"]], predict2)
print(lrm_cm2)

[[ 6783  1583]
 [ 1288 10346]]


In [41]:
accuracy2 = (lrm_cm2[0,0]+lrm_cm2[1,1]) / sum(sum(lrm_cm2))
accuracy2

0.85645