# **Exploratory Data Analysis (EDA)**



> Necessary imports



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from xgboost import XGBClassifier
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split



> Loading the dataset as a pandas DataFrame


In [None]:
# Relative path: 'parkinsons.data' must be in the current working directory.
data = pd.read_csv('parkinsons.data')
# Preview the first 10 rows.
data.head(10)

> Checking the column names and data types of the dataset

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

> Dropping the `name` column as it's not necessary

In [None]:
# Remove the 'name' column; every later step works on the remaining columns.
data = data.drop(columns='name')

> Checking the basic statistics of each one of the **columns**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

> Creating a ***Histogram*** for each of the columns

In [None]:
# Let pandas build the subplot grid itself (one histogram per numeric column)
# at the desired figure size. Handing a single Axes to `ax=` for a
# multi-column frame makes pandas emit a UserWarning and clear the figure
# before redrawing, so the pre-created figure/axes pair is not needed.
axes = data.hist(figsize=(15, 20))
fig = axes.flat[0].get_figure()  # keep `fig` bound to the figure that was drawn
plt.show()

> Checking the skewness of the dataset

In [None]:
# Skewness of each column: 0 means symmetric, positive means a longer right
# tail, negative means a longer left tail.
data.skew()

> Checking the correlation between features

In [None]:
# Pairwise correlation matrix of the columns (pandas default method: Pearson).
# 'status' is still part of `data` here, so its row/column shows how each
# feature correlates with the target.
data.corr()

> So far we can tell that:
 * `status` is the target and its datatype is *integer*. As the name (and the values [0 or 1]) suggests, this is a binary classification task.
 * Some of the features of the dataset are skewed (either to the left or to the right).
 * There are many highly correlated features in the dataset.





> Separating features and target

In [None]:
# Predictors are every column except the label; `columns=` already selects
# the column axis, so no `axis` argument is needed.
features = data.drop(columns='status')
# The label column on its own.
target = data['status']

In [None]:
# Preview the first 10 feature rows ('status' should no longer be present).
features.head(10)

> Checking if there exists any null value

In [None]:
# Number of missing values in each feature column.
features.isnull().sum()

In [None]:
# (number of rows, number of feature columns) before scaling and PCA.
features.shape

> Scaling the data

In [None]:
# Rescale every feature to MinMaxScaler's default [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-set minima/maxima leak into the training features.
# Consider fitting it on the training rows only.
y = target
scaler = MinMaxScaler()
X = scaler.fit_transform(features)

> Reducing the high dimensionality

In [None]:
# A float in (0, 1) asks PCA to keep the smallest number of components whose
# cumulative explained variance exceeds that fraction (here 90%).
# NOTE(review): PCA is fitted on all rows, before the train/test split below,
# so the projection is influenced by test data. Consider fitting it on the
# training rows only.
pca = PCA(n_components=0.9)
X = pca.fit_transform(X)

In [None]:
# (n_samples, n_components): the second entry is how many principal
# components PCA kept.
X.shape

> Splitting the dataset

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y`
# if the two classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

> Fitting an XGBoost Classifier

In [None]:
# XGBoost classifier with the library's default hyperparameters.
clf = XGBClassifier()
# Fit on the training split.
clf.fit(X_train, y_train)

In [None]:
# Predict on both splits so the two scores can be compared; a training score
# far above the test score points to overfitting.
y_test_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

# Accuracy rounded to two decimals for display.
test_accuracy = round(accuracy_score(y_test, y_test_pred), 2)
train_accuracy = round(accuracy_score(y_train, y_train_pred), 2)

print(f"Accuracy at testing set {test_accuracy}")
print(f"Accuracy at training set {train_accuracy}")