# Logistic Regression
## Question:
**Can the `y` be determined by `x`?**
- Can `profit` be determine by `sales`? (had `profit` is either Yes or No)
- Can `credit` be determined by `score`? (got `credit` is either Yes or No)

## Process
### Importing Libraries

In [14]:
import numpy as np
import pandas as pd
from math import log, exp, floor, ceil
from sklearn import linear_model

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams["text.usetex"] = True

### Acquiring Data

In [2]:
# size of the sample
na = 30
n = 500 + na
# set the random seed generator
np.random.seed(100)

# create `n` samples between 0 and 1200
x = [10 * ceil(120 * np.random.random()) for i in range(n)]
# create `n` binary outcomes
y = ["no" if value < 0 else "yes" for value in np.random.logistic(size = n)]
# correct `x` to have a better logistic regression like
for i in range(n):
    # if `y` is "no"
    if y[i] == "no":
        # if `x` is high, half it
        x[i] = x[i] / 2 if x[i] > 600 else x[i]
    # if `y` is "yes"
    else:
        # if `x` is low, push it higher
        x[i] = x[i] + 500 if x[i] < 600 else x[i]

# introduce some NA
for _ in range(na + 1):
    x[np.random.randint(n)] = None
# create a dataframe
df = pd.DataFrame({"score": x, "approved": y})

### Checking Data, Exploratory Data Analysis (EDA)

In [3]:
df.shape

(530, 2)

In [4]:
df.dtypes

approved     object
score       float64
dtype: object

In [5]:
df.head(10)

Unnamed: 0,approved,score
0,no,330.0
1,no,340.0
2,no,510.0
3,yes,1020.0
4,yes,510.0
5,no,150.0
6,yes,810.0
7,no,500.0
8,yes,670.0
9,no,350.0


In [6]:
df.describe()



Unnamed: 0,score
count,500.0
mean,610.26
std,301.543844
min,10.0
25%,
50%,
75%,
max,1200.0


### Deal with Missing Values

In [7]:
has_null = df.isnull().sum().sum()
print("BEFORE: Had %d nulls in %d cases." % (has_null, df.shape[0]))
if has_null > 0:
    df = df.loc[df["score"].notnull(), ]
has_null = df.isnull().sum().sum()
print("AFTER : Has %d nulls in %d cases." % (has_null, df.shape[0]))

BEFORE: Had 30 nulls in 530 cases.
AFTER : Has 0 nulls in 500 cases.


#### Add Numeric Class Label

In [8]:
df["score"] = df["score"].astype("int64")
df["approved"] = df["approved"].astype("category")
df["grant"] = (df["approved"] == "yes").astype(int)
df.shape

(500, 3)

In [9]:
df.head()

Unnamed: 0,approved,score,grant
0,no,330,0
1,no,340,0
2,no,510,0
3,yes,1020,1
4,yes,510,1


In [10]:
df.describe()

Unnamed: 0,score,grant
count,500.0,500.0
mean,610.26,0.504
std,301.543844,0.500485
min,10.0,0.0
25%,377.5,0.0
50%,580.0,1.0
75%,850.0,1.0
max,1200.0,1.0


### Univariate Analysis

#### Variable: `score`

In [15]:
df["score"].plot(kind = "hist", bins = 20)
plt.show()

RuntimeError: LaTeX was not able to process the following string:
'lp'
Here is the full report generated by LaTeX: 



<matplotlib.figure.Figure at 0x7f4cfa918c10>

#### Variable: `grant`

In [12]:
df["grant"].describe()

count    500.000000
mean       0.504000
std        0.500485
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: grant, dtype: float64

In [13]:
df["grant"].value_counts().plot(kind = 'bar')
plt.show()

RuntimeError: LaTeX was not able to process the following string:
'lp'
Here is the full report generated by LaTeX: 



<matplotlib.figure.Figure at 0x7f4cfaa40bd0>

**Note: Distribution is NOT Normal**

### Bivariate Analysis

#### Variables: `score` and `grant`

### Response by Predict Analysis

In [None]:
X = df[["score"]]
y = df["grant"]

In [None]:
ax = plt.axes()
plt.scatter(X, y)
plt.xlabel("score")
ax.set_xticks([0, 200, 400, 600, 800, 1000, 1200])
plt.ylabel("grant")
ax.set_yticks([0, 1])
plt.show()

### Modelling

#### Trying Linear Regression

In [None]:
df_model = linear_model.LinearRegression()
df_fit = df_model.fit(X, y)
df_fit.intercept_, df_fit.coef_

In [None]:
ax = plt.axes()
reg_x = range(int(floor(df["score"].min())), int(ceil(df["score"].max())) + 1)
reg_y = [df_fit.intercept_ + df_fit.coef_ * v for v in reg_x]
plt.scatter(X, y)
plt.plot(reg_x, reg_y, color = "red", linewidth = 2)
plt.text(100, 1.2, "$grant = %.3f + %.3f \\cdot score$" % (df_fit.intercept_, df_fit.coef_[0]), fontsize = 15)
plt.xlabel("score")
ax.set_xticks([0, 200, 400, 600, 800, 1000, 1200])
plt.ylabel("grant")
ax.set_yticks([0, 1])
plt.show()

In [None]:
value = 590
df_fit.intercept_ + df_fit.coef_[0] * value

In [None]:
value = [100, 990]
df_fit.intercept_ + df_fit.coef_ * value

### Log Odds or Logit Function

$$f(x) = logit(p) = log(odds) = log \left ( {p\over 1-p} \right ) \\ 0 \le p \le 1$$

In [None]:
x = [i / 100.0 for i in range(1, 100)]
y = [log(p / (1 - p)) for p in x]
ax = plt.axes()
plt.plot(x, y)
plt.hlines(0, 0, 1, color = "black", linestyles = "solid")
plt.vlines(0.5, -6, 6, color = "black", linestyles = "solid")
ax.set_xticks([0.0, 0.25, 0.5, 0.75, 1.0])
plt.show()

### Sigmoid Function
**Inverse of the Logit function**
$$f(x) = sigmoid(x) = logit^{-1}(x) = {1 \over 1 + e^{-x}} = {e^x \over 1 + e^x} \\ -\infty \le x \le \infty$$

In [None]:
x = [i / 10.0 for i in range(-50, 50)]
y = [exp(p) / (1 + exp(p)) for p in x]
ax = plt.axes()
plt.plot(x, y)
plt.hlines(0.5, -6, 6, color = "black", linestyles = "solid")
plt.vlines(0, 0, 1, color = "black", linestyles = "solid")
ax.set_yticks([0, 0.5, 1])
plt.show()

In [None]:
X = df[["score"]]
y = df["grant"]

In [None]:
print("Coeficients for LINEAR Regression")
print(df_fit.intercept_, df_fit.coef_)
df_model = linear_model.LogisticRegression()
df_fit = df_model.fit(X, y)
print("Coeficients for LOGISTIC Regression")
print(df_fit.intercept_, df_fit.coef_)

In [None]:
ax = plt.axes()
reg_x = range(int(floor(df["score"].min())), int(ceil(df["score"].max())) + 1)
reg_y = [exp(df_fit.intercept_ + df_fit.coef_ * v) / (1 + exp(df_fit.intercept_ + df_fit.coef_ * v)) for v in reg_x]
plt.scatter(X, y)
plt.plot(reg_x, reg_y, color = "red", linewidth = 2)
plt.text(-30, 0.9, "$grant = %.3f + %.3f \\cdot score$" % (df_fit.intercept_, df_fit.coef_[0]), fontsize = 15)
plt.xlabel("score")
ax.set_xticks([0, 200, 400, 600, 800, 1000, 1200])
plt.ylabel("grant")
ax.set_yticks([0, 1])
plt.show()

In [None]:
value = 590
np.exp(df_fit.intercept_ + df_fit.coef_[0] * value) / (1 + np.exp(df_fit.intercept_ + df_fit.coef_[0] * value))

In [None]:
value = [100, 990]
np.exp(df_fit.intercept_ + df_fit.coef_[0] * value) / (1 + np.exp(df_fit.intercept_ + df_fit.coef_[0] * value))

$$p(y)={e^{\alpha + \beta_1 x_1}\over 1 +e^{\alpha + \beta_1 x_1}}$$

In [None]:
def prob(x):
    return exp(x) / (1 + exp(x))

x = [i / 10.0 for i in range(-50, 50)]
y = [prob(value) for value in x]
ax = plt.axes()
plt.plot(x, y)
ax.set_yticks([0, 0.5, 1])
plt.show()

$$odds(p(y))={p(y)\over 1-p(y)} ={{e^{\alpha + \beta_1 x_1}\over 1 +e^{\alpha + \beta_1 x_1}} \over 1 - \left ({e^{\alpha + \beta_1 x_1}\over 1 +e^{\alpha + \beta_1 x_1}}\right)}$$

In [None]:
def odds(x):
    return prob(x) / (1 - prob(x))

x = [i / 10.0 for i in range(-50, 50)]
y = [odds(value) for value in x]
plt.plot(x, y)
plt.show()

$$log(odds(p(y)))=\alpha + \beta_1 x_1$$

In [None]:
def logit(x):
    return log(odds(x))

x = [i / 10.0 for i in range(-50, 50)]
y = [logit(value) for value in x]
plt.plot(x, y)
plt.show()