# Importing Libraries

In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install plotly
!pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Importing Data

In [3]:
df = pd.read_csv("train_buys_computer.csv", index_col=0)
df_test = pd.read_csv("test_buys_computer.csv", index_col=0)


In [4]:
df.head()

Unnamed: 0,age,income,student,credit_rating,buys_computer
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes


In [5]:
for i in df.columns:
    print("-------------------")
    print(i," : ",df[i].unique())

-------------------
age  :  ['youth' 'middle_aged' 'senior']
-------------------
income  :  ['high' 'medium' 'low']
-------------------
student  :  ['no' 'yes']
-------------------
credit_rating  :  ['fair' 'excellent']
-------------------
buys_computer  :  ['no' 'yes']


In [6]:
#info of train dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 1 to 14
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            14 non-null     object
 1   income         14 non-null     object
 2   student        14 non-null     object
 3   credit_rating  14 non-null     object
 4   buys_computer  14 non-null     object
dtypes: object(5)
memory usage: 672.0+ bytes


# Cleaning the Dataset 

In [7]:
#null values in dataset
df.isnull().sum()

age              0
income           0
student          0
credit_rating    0
buys_computer    0
dtype: int64

In [None]:
#removing null values
df = df.dropna(how='any',axis=0)

In [None]:
df.isnull().sum()

In [None]:
#removing unnecessary variable
df.drop(['fnlwgt'], inplace=True, axis='columns')

In [None]:
df.head(50)

# Exploring the Dataset 

### Numerical variables

In [None]:
#exploring the age variable
df['student'].describe()

In [None]:
#boxplot for age variable
fig = px.box(df, y='buys_computer', color='income', title='Income level based on the age of an individual', labels={'y':'Buys Computer', 'x':'student'})
fig.show()

In [None]:
#exploring educationnum variable
df['educationnum'].describe()

In [None]:
#boxplot for educationnum variable
fig = px.box(df, y='educationnum', color='incomelevel', title='Years of education distribution for different income levels', labels={'y':'Age', 'x':'Income level'})
fig.show()

In [None]:
#exploring capital-gain capital loss variable
df['capitalgain'].describe()

In [None]:
df['capitalloss'].describe()

In [None]:
#exploring hours/week variable
df['hoursperweek'].describe()

In [None]:
#boxplot for hoursperweek variable
fig = px.box(df, y='hoursperweek', color='incomelevel', title='Hours per week distribution for different income levels', labels={'y':'Age', 'x':'Income level'})
fig.show()

### Categorical variables

In [None]:
g = sns.FacetGrid(df, col='buys_computer')
g.map(plt.hist, "age")
g.set_xticklabels(rotation=90)

In [None]:
g = sns.FacetGrid(df, col='buys_computer')
g.map(plt.hist, "income")
g.set_xticklabels(rotation=90)

In [None]:
g = sns.FacetGrid(df, col='buys_computer')
g.map(plt.hist, "student")
g.set_xticklabels(rotation=90)

In [None]:
g = sns.FacetGrid(df, col='buys_computer')
g.map(plt.hist, "credit_rating")
g.set_xticklabels(rotation=90)

In [None]:
g = sns.FacetGrid(df, col='buys_computer')
g.map(plt.hist, "sex")
g.set_xticklabels(rotation=90)

In [None]:
g = sns.FacetGrid(df, col='incomelevel')
g.map(plt.hist, "race")
g.set_xticklabels(rotation=90)

# Handling Categorical Variables

In [8]:
cat_cols = df[['age','income','student','credit_rating','buys_computer']]

for feature in cat_cols.columns[:]:
    print(feature,':',len(cat_cols[feature].unique()),'labels')

age : 3 labels
income : 3 labels
student : 2 labels
credit_rating : 2 labels
buys_computer : 2 labels


### Income Level Variable

In [9]:
df['buys_computer'].unique()

dictionary={'no':0,'yes':1}
df['buys_computer'] = df.buys_computer.map(dictionary)

df.head()

Unnamed: 0,age,income,student,credit_rating,buys_computer
1,youth,high,no,fair,0
2,youth,high,no,excellent,0
3,middle_aged,high,no,fair,1
4,senior,medium,no,fair,1
5,senior,low,yes,fair,1


### Workclass variable

In [10]:
df['age'].unique()

age_labels = df.groupby(['age'])['buys_computer'].mean().sort_values().index
print(age_labels)

age_labels2={k:i for i,k in enumerate(age_labels,0)}
print(age_labels2)

df['age'] = df['age'].map(age_labels2)

df.head()

Index(['youth', 'senior', 'middle_aged'], dtype='object', name='age')
{'youth': 0, 'senior': 1, 'middle_aged': 2}


Unnamed: 0,age,income,student,credit_rating,buys_computer
1,0,high,no,fair,0
2,0,high,no,excellent,0
3,2,high,no,fair,1
4,1,medium,no,fair,1
5,1,low,yes,fair,1


### Education

In [11]:
df['income'].unique()

income_labels = df.groupby(['income'])['buys_computer'].mean().sort_values().index
print(income_labels)

income_labels2={k:i for i,k in enumerate(income_labels,0)}
print(income_labels2)

df['income'] = df['income'].map(income_labels2)

df.head()

Index(['high', 'medium', 'low'], dtype='object', name='income')
{'high': 0, 'medium': 1, 'low': 2}


Unnamed: 0,age,income,student,credit_rating,buys_computer
1,0,0,no,fair,0
2,0,0,no,excellent,0
3,2,0,no,fair,1
4,1,1,no,fair,1
5,1,2,yes,fair,1


### Marital Status

In [12]:
df['student'].unique()

student_labels = df.groupby(['student'])['buys_computer'].mean().sort_values().index
print(student_labels)

student_labels2={k:i for i,k in enumerate(student_labels,0)}
print(student_labels2)

df['student'] = df['student'].map(student_labels2)

df.head()

Index(['no', 'yes'], dtype='object', name='student')
{'no': 0, 'yes': 1}


Unnamed: 0,age,income,student,credit_rating,buys_computer
1,0,0,0,fair,0
2,0,0,0,excellent,0
3,2,0,0,fair,1
4,1,1,0,fair,1
5,1,2,1,fair,1


### Occupation

In [13]:
df['credit_rating'].unique()

credit_rating_labels = df.groupby(['credit_rating'])['buys_computer'].mean().sort_values().index
print(credit_rating_labels)

credit_rating_labels2={k:i for i,k in enumerate(credit_rating_labels,0)}
print(credit_rating_labels2)

df['credit_rating'] = df['credit_rating'].map(credit_rating_labels2)

df.head()

Index(['excellent', 'fair'], dtype='object', name='credit_rating')
{'excellent': 0, 'fair': 1}


Unnamed: 0,age,income,student,credit_rating,buys_computer
1,0,0,0,1,0
2,0,0,0,0,0
3,2,0,0,1,1
4,1,1,0,1,1
5,1,2,1,1,1


### Relationship

In [None]:
df['relationship'].unique()

rel_labels = df.groupby(['relationship'])['incomelevel'].mean().sort_values().index
print(rel_labels)

rel_labels2={k:i for i,k in enumerate(rel_labels,0)}
print(rel_labels2)

df['relationship'] = df['relationship'].map(rel_labels2)

### Race

In [None]:
df['race'].unique()

race_labels = df.groupby(['race'])['incomelevel'].mean().sort_values().index
print(race_labels)


race_labels2={k:i for i,k in enumerate(race_labels,0)}
print(race_labels2)

df['race'] = df['race'].map(race_labels2)

### Gender 

In [None]:
df['sex'].unique()

sex_labels = df.groupby(['sex'])['incomelevel'].mean().sort_values().index
print(sex_labels)

sex_labels2={k:i for i,k in enumerate(sex_labels,0)}
sex_labels2

df['sex'] = df['sex'].map(sex_labels2)

### Native country

In [None]:
df['nativecountry'].unique()

nc_labels = df.groupby(['nativecountry'])['incomelevel'].mean().sort_values().index
print(nc_labels)


nc_labels2={k:i for i,k in enumerate(nc_labels,0)}
print(nc_labels2)

df['nativecountry'] = df['nativecountry'].map(nc_labels2)

In [None]:
df.head(50)

# Building Model 

In [14]:
from sklearn.model_selection import train_test_split
import sklearn as sk

In [15]:
X = df.iloc[:,:-1]

In [16]:
y = df.iloc[:,-1]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 1.00


In [20]:
pred = clf.predict(X_test)

In [21]:
#Creating the Confusion matrix  
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(y_test, pred)  
cm

array([[1]], dtype=int64)

In [22]:
from sklearn.metrics import classification_report
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

1.0

# Predicting actual test values

#### Performing all the cleaning anf feature engineerig process in test dataset

In [None]:
df_test = df_test.iloc[1:,:]

In [None]:
df_test.replace('NaN', np.nan, inplace=True)
df_test.fillna('', inplace=True)

In [None]:
df_test.head()

In [None]:
df_test.isnull().sum()

### Categorical Values 

In [None]:
df_test.head()

In [None]:
df_test['buys_computer'].unique()

dictionary={'no':0,'yes':1}
df_test['buys_computer'] = df_test.buys_computer.map(dictionary)

df_test.head()

In [None]:
df_test['age'].unique()

age_labels = df_test.groupby(['age'])['buys_computer'].mean().sort_values().index
print(age_labels)

age_labels2={k:i for i,k in enumerate(age_labels,0)}
print(age_labels2)

df_test['age'] = df_test['age'].map(age_labels2)

df_test.head()

In [None]:
df_test['income'].unique()

income_labels = df_test.groupby(['income'])['buys_computer'].mean().sort_values().index
print(income_labels)

income_labels2={k:i for i,k in enumerate(income_labels,0)}
print(income_labels2)

df_test['income'] = df_test['income'].map(income_labels2)

df_test.head()

In [None]:
df_test['student'].unique()

student_labels = df_test.groupby(['student'])['buys_computer'].mean().sort_values().index
print(student_labels)

student_labels2={k:i for i,k in enumerate(student_labels,0)}
print(student_labels2)

df_test['student'] = df_test['student'].map(student_labels2)

df_test.head()

In [None]:
df_test['credit_rating'].unique()

credit_rating_labels = df_test.groupby(['credit_rating'])['buys_computer'].mean().sort_values().index
print(credit_rating_labels)

credit_rating_labels2={k:i for i,k in enumerate(credit_rating_labels,0)}
print(credit_rating_labels2)

df_test['credit_rating'] = df_test['credit_rating'].map(credit_rating_labels2)

df_test.head()

In [None]:
df_test['relationship'].unique()

rel_labels = df_test.groupby(['relationship'])['incomelevel'].mean().sort_values().index
print(rel_labels)

enumerate(rel_labels,0)

rel_labels2={k:i for i,k in enumerate(rel_labels,0)}
rel_labels2

df_test['relationship'] = df_test['relationship'].map(rel_labels2)

In [None]:
df_test['race'].unique()

race_labels = df_test.groupby(['race'])['incomelevel'].mean().sort_values().index
print(race_labels)

enumerate(race_labels,0)

race_labels2={k:i for i,k in enumerate(race_labels,0)}
race_labels2

df_test['race'] = df_test['race'].map(race_labels2)

In [None]:
df_test['sex'].unique()

sex_labels = df_test.groupby(['sex'])['incomelevel'].mean().sort_values().index
print(sex_labels)

enumerate(sex_labels,0)

sex_labels2={k:i for i,k in enumerate(sex_labels,0)}
sex_labels2

df_test['sex'] = df_test['sex'].map(sex_labels2)

In [None]:
df_test['nativecountry'].unique()

nc_labels = df_test.groupby(['nativecountry'])['incomelevel'].mean().sort_values().index
print(nc_labels)

enumerate(nc_labels,0)

nc_labels2={k:i for i,k in enumerate(nc_labels,0)}
nc_labels2

df_test['nativecountry'] = df_test['nativecountry'].map(nc_labels2)

In [None]:
df_test.head()

In [None]:
X = df_test.iloc[:, :-1].values
y = df_test.iloc[:,-1]

In [None]:
y

### Creating a Pickle file

In [24]:
import pickle
file = open('buys_computer_pred.pkl','wb')
#dump information to that file
pickle.dump(clf, file)

In [None]:
# pip install flask
# pip install os
# pip install jsonify
# pip install requests