### Nombre: Mauricio Juárez Sánchez
### Matrícula: A01660336
### Diabetes Dataset Analysis


In [403]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [404]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [405]:
diabetes = pd.read_csv('/content/drive/MyDrive/Inteligencia artificial avanzada para la ciencia de datos/Tareas/Python libraries/diabetes.csv')

In [406]:
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### Removing outliers

In [407]:
#Removing outliers
from sklearn.neighbors import LocalOutlierFactor
X = diabetes.drop('Outcome',axis = 1)
y = diabetes['Outcome']
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)

# Fitting the model to the data
lof.fit(X)

# Getting the outlier scores
outlier_scores = lof.negative_outlier_factor_

# Find the outliers
outliers = X[outlier_scores < -2]

# Remove the outliers from the data
diabetes_no_outliers = diabetes.iloc[(outlier_scores >= -2).nonzero()[0]]

# Display the resulting DataFrame
X=diabetes_no_outliers.drop('Outcome',axis = 1)
y = diabetes_no_outliers['Outcome']
diabetes_no_outliers

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### Working with missing data

In [408]:
# We will apply median imputation for missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
imputer.fit(X)
X_imputed = imputer.transform(X)
# Create a new DataFrame from the imputed data
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Display the resulting DataFrame
X = X_imputed_df
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0
...,...,...,...,...,...,...,...,...
755,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0
756,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0
757,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0
758,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0


### Preprocessing

In [409]:
#Applying a standard scaler
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X)
# Transform the data
X_scaled = scaler.transform(X)

# Create a new DataFrame from the scaled data
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Display the resulting DataFrame
X = X_scaled_df

### Feature Extraction

In [410]:
X["Pregnancies"] = X["Pregnancies"].apply(lambda x: int(x))
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,0.756098,0.000000,0.37500,-0.269307,0.172043,0.669707,1.235294
1,0,-0.780488,-0.333333,0.18750,-0.269307,-0.580645,-0.049511,0.117647
2,1,1.609756,-0.444444,-0.71875,-0.269307,-0.935484,0.786971,0.176471
3,0,-0.682927,-0.333333,0.00000,0.475248,-0.419355,-0.528990,-0.470588
4,0,0.487805,-1.777778,0.37500,1.061386,1.193548,4.998046,0.235294
...,...,...,...,...,...,...,...,...
755,1,-0.390244,0.222222,0.78125,1.156436,0.096774,-0.518567,2.000000
756,0,0.121951,-0.111111,0.12500,-0.269307,0.516129,-0.078176,-0.117647
757,0,0.097561,0.000000,0.00000,0.617822,-0.623656,-0.325733,0.058824
758,0,0.219512,-0.666667,-0.71875,-0.269307,-0.204301,-0.054723,1.058824


### Train model

In [411]:
from sklearn.model_selection import train_test_split
y = diabetes_no_outliers['Outcome']
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 760, dtype: int64


In [412]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,0.756098,0.000000,0.37500,-0.269307,0.172043,0.669707,1.235294
1,0,-0.780488,-0.333333,0.18750,-0.269307,-0.580645,-0.049511,0.117647
2,1,1.609756,-0.444444,-0.71875,-0.269307,-0.935484,0.786971,0.176471
3,0,-0.682927,-0.333333,0.00000,0.475248,-0.419355,-0.528990,-0.470588
4,0,0.487805,-1.777778,0.37500,1.061386,1.193548,4.998046,0.235294
...,...,...,...,...,...,...,...,...
755,1,-0.390244,0.222222,0.78125,1.156436,0.096774,-0.518567,2.000000
756,0,0.121951,-0.111111,0.12500,-0.269307,0.516129,-0.078176,-0.117647
757,0,0.097561,0.000000,0.00000,0.617822,-0.623656,-0.325733,0.058824
758,0,0.219512,-0.666667,-0.71875,-0.269307,-0.204301,-0.054723,1.058824


In [413]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [414]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [415]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [416]:
from sklearn.model_selection import cross_val_score

In [417]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
#knn = KNeighborsClassifier(n_neighbors=25)
# Fit the model to the data
knn.fit(X_train, y_train)

# Predict the class labels for the test set
#print('K neighbors algorithm')
#y_pred = knn.predict(X_test)
#print(accuracy_score(y_pred, y_test))

print('Random Forest Algorithm')
rf_model = RandomForestClassifier(n_estimators=14, random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(accuracy_score(y_pred, y_test))


Random Forest Algorithm
0.8223684210526315
