# Logistic Regression

In [None]:
# # # Logistic Regression Classifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv('data/diabetes.csv')
print(df.head().to_string())

# # Check data description
print(df.describe(include='all').to_string())

# # Splitting the dataset into features (input) and target (output, label)
# Every column except 'Outcome' is a feature; 'Outcome' is the label to predict.
X = df.drop(columns='Outcome')
y = df['Outcome']

# # Logistic Regression Classifier training
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# The features are fed in unscaled, so the solver's default budget of 100 iterations
# can run out before it converges (scikit-learn then emits a ConvergenceWarning).
# A larger max_iter lets the optimisation finish.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# # Test prediction
# Score the fitted model on the held-out rows only.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# Naive Bayes

In [ ]:
# # # Naive Bayes Classifier
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv('data/loan_data.csv')
print(df.head().to_string())

# # Check data description
print(df.describe(include='all').to_string())

# # Handling Categorical Data using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# mapping[column] holds the {original label: integer code} table for each encoded column.
mapping = {}
for column in df.columns:
    # Encode every non-numeric column. Testing for "not numeric" (rather than
    # dtype == 'object') also catches text columns that pandas stores with its
    # dedicated string dtype, which would otherwise reach the classifier unencoded.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = label_encoder.fit_transform(df[column])
        # LabelEncoder assigns each class its position in classes_, so enumerate
        # reproduces the codes directly (as plain ints) without a second transform.
        mapping[column] = {label: code for code, label in enumerate(label_encoder.classes_)}
print(mapping)

# # Splitting the dataset into features (input) and target (output, label)
# Every column except 'not.fully.paid' is a feature; 'not.fully.paid' is the label.
X = df.drop(columns='not.fully.paid')
y = df['not.fully.paid']

# # Naive Bayes Classifier training
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# NOTE(review): BernoulliNB binarizes every feature at its default threshold
# (binarize=0.0). If most columns here are continuous, GaussianNB may be the
# better fit - confirm the intended model before relying on this score.
nb = BernoulliNB()
nb.fit(X_train, y_train)

# # Test prediction
# Score the fitted model on the held-out rows only.
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# Decision Tree

In [ ]:
# # DecisionTreeClassifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
from graphviz import Source

# # Load dataset
df = pd.read_csv('data/car_evaluation.csv')
print(df.head().to_string())

# # Check data description
print(df.describe(include='all').to_string())

# # Handling Categorical Data using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Every column is replaced by its integer codes; mapping remembers, per column,
# which original label corresponds to which code.
mapping = {}
for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])
    codes = label_encoder.transform(label_encoder.classes_)
    mapping[column] = dict(zip(label_encoder.classes_, codes))
print(mapping)

# # Splitting the dataset into features (input) and target (output, label)
feature_names = ['BuyingPrice', 'MaintenancePrice', 'Doors', 'PersonsCapacity', 'LuggageBoot', 'Safety']
X = df[feature_names]
y = df['CarEvaluation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# # DecisionTreeClassifier training
tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
tree_clf.fit(X_train, y_train)

# # Plot tree_clf
# NOTE(review): class_names must follow the encoder's code order for
# 'CarEvaluation' - verify against the printed mapping.
export_graphviz(
    tree_clf,
    out_file='car_evaluation.dot',
    feature_names=list(X.columns),
    class_names=['acceptable', 'good', 'unacceptable', 'very good'],
    rounded=True,
    filled=True,
    impurity=True,
    proportion=True,
    special_characters=True,
)

# download and install graphviz from https://graphviz.gitlab.io/_pages/Download/Download_windows.html
# Render the exported .dot file to PNG and open it in the default viewer.
Source.from_file('car_evaluation.dot', format='png').view()

# # Test prediction
# Score the fitted tree on the held-out rows only.
y_pred = tree_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# Manual Test
# Raw category labels for a single car, listed in the same order as the feature columns:
# BuyingPrice, MaintenancePrice, Doors, PersonsCapacity, LuggageBoot, Safety
test = ['low', 'low', '5more', 'more', 'big', 'high']

# convert test data to numerical using mapping from label encoder
# Each label is translated with a direct dictionary lookup. The result gets its
# own name so the hold-out X_test produced by train_test_split is not overwritten.
encoded_sample = [mapping[column][label] for column, label in zip(X.columns, test)]

# Predict from a one-row DataFrame that carries the training column names; the
# tree was fitted on a DataFrame, and a bare list triggers a feature-name warning.
manual_X = pd.DataFrame([encoded_sample], columns=X.columns)
result = tree_clf.predict(manual_X)

# Invert the target mapping (code -> label) and decode the single predicted class,
# instead of searching a list of codes with a whole NumPy array.
code_to_label = {code: label for label, code in mapping['CarEvaluation'].items()}
print("result: ", code_to_label[result[0]])