<a href="https://colab.research.google.com/github/ozturkgizem/Agriculture/blob/main/log_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Logistic Regression**

1. Exploratory Data Analysis
2. Data Preprocessing
3. Model & Prediction
4. Model Evaluation
5. Model Validation: Holdout
6. Model Validation: 10-Fold Cross Validation
7. Prediction for A New Observation

In [55]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
from sklearn.model_selection import train_test_split, cross_validate

In [5]:
import warnings
warnings.simplefilter('ignore')

**EDA**

In [6]:
df= pd.read_csv("Crop_recommendation.csv")

In [7]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [None]:
# BİZDE BU GRAPLAR VAR
# Target'ın Analizi

df["label"].value_counts()
sns.countplot(x="label", data=df)
plt.show()
100 * df["label"].value_counts() / len(df)

# Feature'ların Analizi
df.head()

def plot_numerical_col(dataframe, numerical_col):
    dataframe[numerical_col].hist(bins=20)
    plt.xlabel(numerical_col)
    plt.show(block=True)

for col in df.columns:
    plot_numerical_col(df, col)

cols = [col for col in df.columns if "label" not in col]

# for col in cols:
#     plot_numerical_col(df, col)

df.describe().T

In [None]:
#  BU DA VAR
# Target vs Features

def target_summary_with_num(dataframe, target, numerical_col):
    print(dataframe.groupby(target).agg({numerical_col: "mean"}), end="\n\n\n")

for col in cols:
    target_summary_with_num(df, "label", col)

Data Preprocessing

In [None]:
df.shape
df.head()

df.isnull().sum()

df.describe().T

In [None]:
for col in cols:
    print(col, check_outlier(df, col))

# replace_with_thresholds(df, "değisken_adı")

In [None]:
for col in cols:
    df[col] = RobustScaler().fit_transform(df[[col]])

df.head()

Model Prediction

In [14]:
y = df["label"]
X = df.drop(["label"], axis=1)

log_model = LogisticRegression().fit(X, y)

In [None]:
log_model.intercept_
log_model.coef_

y_pred = log_model.predict(X)

y_pred[0:10]

y[0:10]

**Model Evaluation**

In [16]:
def plot_confusion_matrix(y, y_pred):
    acc = round(accuracy_score(y, y_pred), 2)
    cm = confusion_matrix(y, y_pred)
    sns.heatmap(cm, annot=True, fmt=".0f")
    plt.xlabel('y_pred')
    plt.ylabel('y')
    plt.title('Accuracy Score: {0}'.format(acc), size=10)
    plt.show()

In [None]:
plot_confusion_matrix(y, y_pred)
print(classification_report(y, y_pred))

In [None]:
# ROC AUC- colabde çalışmadı
y_prob = log_model.predict_proba(X)[:, 1]
y_prob
roc_auc_score(y, y_prob)

In [26]:
#ROC-AUC
y_prob = log_model.predict_proba(X)
roc_auc = roc_auc_score(y ,y_prob, multi_class='ovr')
print(roc_auc)

0.9996820346320348


**Model Validation: Holdout**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20, random_state=17)

log_model = LogisticRegression().fit(X_train, y_train)

y_pred = log_model.predict(X_test)
y_prob = log_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))

In [None]:
pip install sklearn.metrics==1.0

In [None]:
plot_roc_curve(log_model, X_test, y_test)
plt.title('ROC Curve')
plt.plot([0, 1], [0, 1], 'r--')
plt.show()

In [79]:
# AUC
roc_auc_score(y, y_prob, multi_class='ovr')

AxisError: ignored

**Model Validation: 10-Fold Cross Validation**

In [81]:
y = df["label"]
X = df.drop(["label"], axis=1)

log_model = LogisticRegression().fit(X, y)

cv_results = cross_validate(log_model,
                            X, y,
                            cv=5,
                            scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])

In [85]:
print(cv_results['test_accuracy'].mean())
print(cv_results['test_precision'].mean())
print(cv_results['test_recall'].mean())
print(cv_results['test_f1'].mean())
print(cv_results['test_roc_auc'].mean())

0.969090909090909
nan
nan
nan
nan



**Prediction for A New Observation**

In [83]:
X.columns

random_user = X.sample(1, random_state=45)
log_model.predict(random_user)

array(['coffee'], dtype=object)