In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw data from the Excel workbook and take a first look at it.
df = pd.read_excel("PurchaseBike.xlsx")
df.info()
df.head()

# Remove the columns that are not used in the analysis.
cols_to_drop = ["ID"]
df = df.drop(columns=cols_to_drop)
# First five rows of the dataframe once those columns are gone.
df.head()

# Names of the int64/float64 columns, leaving the target column out.
numerical_columns = [
    col
    for col, dtype in df.dtypes.items()
    if col != 'Purchased Bike' and (dtype == 'int64' or dtype == 'float64')
]
numerical_columns

# Compact summary (minimum, mean, median, maximum) of the numeric columns.
summary = df[numerical_columns].describe()
summary.loc[['min', 'mean', '50%', 'max'], :]

### Dependent variable: Purchased Bike -> encode as 1 = Yes / 0 = No
### Continuous independent variables: Income, Age, Cars (number of cars) and Children (number of children);
###                                    -> inspect the frequency distribution and decide whether to turn them
###                                       into discrete variables
### Two-level discrete independent variables, to be made binary: Marital Status / Gender and Home Owner
### Remaining discrete independent variables: Education, Occupation, Region and Commute Distance
###                                    -> expand them with the dummies function

### Dependent variable: Purchased Bike (1 when the value is 'Yes', 0 for anything else)
df['Purchased Bike'] = df['Purchased Bike'].eq('Yes').astype('int64')
df

### Two-level discrete independent variables (1 for the value named below, 0 for anything else)
df['Marital Status'] = df['Marital Status'].eq('Married').astype('int64')
df['Gender'] = df['Gender'].eq('Male').astype('int64')
df['Home Owner'] = df['Home Owner'].eq('Yes').astype('int64')
df

### Continuous independent variable: Age
# Histogram of the raw ages, used to choose the bin edges below.
plt.hist(df["Age"],bins=5)
plt.xlabel("Age")
plt.ylabel("Frequency")

# Replace the numeric age with an interval category: (30, 40], (40, 50], ... (70, 100].
# pd.cut returns NaN for values outside the edges, i.e. ages <= 30 or > 100.
# NOTE(review): confirm the data holds no such ages, otherwise those rows lose their age group.
df["Age"] = pd.cut(df["Age"],bins=[30,40,50,60,70,100])
df

### Continuous independent variable: Income
# Histogram of the raw incomes, used to choose the bin edges below.
plt.hist(df["Income"],bins=20)
plt.xlabel("Income")
plt.ylabel("Frequency")

# Same treatment as Age: right-closed intervals, with NaN for incomes <= 10000 or > 200000.
# NOTE(review): confirm no income sits exactly on the lowest edge (10000).
df["Income"] = pd.cut(df["Income"],bins=[10000,50000,90000,120000,200000])
df

### Continuous independent variable: Cars (number of cars)
# Only the distribution is plotted; the column itself is left numeric.
plt.hist(df["Cars"],bins=10)
plt.xlabel("Cars")
plt.ylabel("Frequency")

### Continuous independent variable: Children (number of children)
# Only the distribution is plotted; the column itself is left numeric.
plt.hist(df["Children"],bins=6)
plt.xlabel("Children")
plt.ylabel("Frequency")

### Remaining discrete independent variables: Education, Occupation, Region and Commute Distance
# get_dummies expands every non-numeric column into indicator columns; this
# also covers the Age and Income interval categories.
df = pd.get_dummies(df)
df

df.info()

# One indicator per encoded variable is removed, so that level becomes the
# implicit baseline (all remaining indicators of the variable equal to 0).
baseline_indicator_columns = [
    "Income_(120000, 200000]",
    "Education_Partial High School",
    "Occupation_Skilled Manual",
    "Commute Distance_5-10 Miles",
    "Region_Pacific",
    "Age_(70, 100]",
]
df = df.drop(columns=baseline_indicator_columns)
df.info()

### Machine-learning routine

### Separate the predictors from the dependent variable
feat = df.drop(columns=['Purchased Bike'])
label = df["Purchased Bike"]

### Training and test sets: 70% of the rows for training, 30% for testing
from sklearn.model_selection import train_test_split

# No random_state is passed, so the split (and the accuracy printed below)
# changes from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(feat, label, test_size=0.3)

### Standardise the features (z-scores) before feeding them to the classifier
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
# The mean and standard deviation are learned from the training rows only.
X_train = sc_x.fit_transform(X_train)
# The test rows are scaled with those same training statistics. Re-fitting the
# scaler here would put the two sets on different scales and let test-set
# statistics influence the evaluation.
X_test = sc_x.transform(X_test)

### Classifier: support vector machine with an RBF kernel
from sklearn.svm import SVC

support_vector_classifier = SVC(kernel='rbf')
support_vector_classifier.fit(X_train, y_train)
y_pred_svc = support_vector_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix

cm_support_vector_classifier = confusion_matrix(y_test, y_pred_svc)
print(cm_support_vector_classifier,end='\n\n')

# Accuracy = correctly classified rows (diagonal of the confusion matrix)
# divided by all test rows. trace()/sum() works for any matrix size, whereas
# indexing [1][1] fails if only one class shows up in the test split.
numerator = cm_support_vector_classifier.trace()
denominator = cm_support_vector_classifier.sum()
acc_svc = (numerator/denominator) * 100
print("Accuracy : ",round(acc_svc,2),"%")

# Reload the raw workbook and restart the preparation with fewer columns.
df = pd.read_excel("PurchaseBike.xlsx")
df.info()
df.head()

# Keep only the selected predictors plus the target. The explicit .copy()
# makes the selection an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
df = df[['Cars', 'Commute Distance', 'Age', 'Marital Status', 'Education', 'Region', 'Purchased Bike']].copy()
df

### Dependent variable: Purchased Bike (1 when the value is 'Yes', 0 for anything else)
df['Purchased Bike'] = df['Purchased Bike'].apply(lambda x: 1 if x == 'Yes' else 0)
df

# Age becomes an interval category: (30, 45], (45, 60], (60, 100].
# pd.cut returns NaN for values outside the edges, i.e. ages <= 30 or > 100.
# NOTE(review): confirm the data holds no such ages.
df["Age"] = pd.cut(df["Age"],bins=[30,45,60,100])
df

### Two-level discrete independent variable (1 for 'Married', 0 for anything else)
df['Marital Status'] = df['Marital Status'].eq('Married').astype('int64')
df

### Remaining discrete independent variables: Education, Region and Commute Distance
# get_dummies expands every non-numeric column into indicator columns; this
# also covers the Age interval category.
df = pd.get_dummies(df)
df

df.info()

# One indicator per encoded variable is removed, so that level becomes the
# implicit baseline (all remaining indicators of the variable equal to 0).
baseline_indicator_columns = [
    "Education_Partial High School",
    "Commute Distance_5-10 Miles",
    "Region_Pacific",
    "Age_(60, 100]",
]
df = df.drop(columns=baseline_indicator_columns)
df.info()

df

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

def machine_learning_svc(df_ml, var_dep):
    """Fit an RBF-kernel SVC on ``df_ml`` and predict the same rows.

    The classifier is trained on every row of ``df_ml`` and then asked to
    predict those very rows, so the result is an in-sample prediction and
    not an estimate of out-of-sample performance.

    Args:
        df_ml: DataFrame holding the predictor columns plus the target
            column. Every column other than ``var_dep`` is standardised
            and used as a feature.
        var_dep: Name of the target column in ``df_ml``.

    Returns:
        The labels predicted by the fitted classifier, one per row of
        ``df_ml`` and in the same row order.

    Raises:
        KeyError: If ``var_dep`` is not a column of ``df_ml``.
    """
    features = df_ml.drop(columns=[var_dep])
    target = df_ml[var_dep]

    # Training and prediction use the same rows, so the features are
    # standardised once and the scaled matrix is reused for both steps.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    support_vector_classifier = SVC(kernel='rbf')
    support_vector_classifier.fit(features_scaled, target)
    return support_vector_classifier.predict(features_scaled)

# Run the prediction routine based on the SVC (Support Vector Classifier) method
y_pred_svc = machine_learning_svc(df, 'Purchased Bike')
df['ml_predict'] = y_pred_svc
df

# Reload the raw workbook so the predictions sit next to the original,
# un-encoded columns.
df = pd.read_excel("PurchaseBike.xlsx")
df.info()
df.head()

df['Purchased Bike'] = df['Purchased Bike'].eq('Yes').astype('int64')
# The predictions are attached by position, which relies on the reloaded
# file having the same rows in the same order as the frame they came from.
df['ml_predict'] = y_pred_svc
df

# Rows where the predicted label differs from the actual one.
is_misclassified = df['Purchased Bike'].ne(df['ml_predict'])
df_mis = df.loc[is_misclassified]
df_mis



