### Projeto - Análise e predição de desempenho de alunos para seu próximo emprego.

In [1]:
# Datas e Horas
from datetime import datetime

# Manipulação de Dados
import numpy as np
import pandas as pd

# Visualização de Dados
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-Processamento
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# Processamento ML
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Avaliação de Desempenho
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Load the dataset from the CSV file in the working directory
dt = pd.read_csv('campus.csv')

# Preview the first 10 rows
dt.head(n=10)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
5,6,M,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.0,Mkt&Fin,51.58,Not Placed,
6,7,F,46.0,Others,49.2,Others,Commerce,79.0,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,Not Placed,
7,8,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0
8,9,M,73.0,Central,79.0,Central,Commerce,72.0,Comm&Mgmt,No,91.34,Mkt&Fin,61.29,Placed,231000.0
9,10,M,58.0,Central,70.0,Central,Commerce,61.0,Comm&Mgmt,No,54.0,Mkt&Fin,52.21,Not Placed,


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
dt.describe()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [5]:
# Inspect column dtypes and non-null counts
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


##### Otimizando o dataset

In [7]:
# Per-column memory usage in bytes; deep=True also measures the Python strings held by object columns
dt.memory_usage(deep=True)

Index               128
sl_no              1720
gender            12470
ssc_p              1720
ssc_b             13661
hsc_p              1720
hsc_b             13629
hsc_s             13840
degree_p           1720
degree_t          14098
workex            12759
etest_p            1720
specialisation    13665
mba_p              1720
status            13813
salary             1720
dtype: int64

In [9]:
# Store `gender` as a categorical column instead of Python strings
dt['gender'] = dt['gender'].astype('category')

In [10]:
# Re-check memory usage after converting `gender` to category
dt.memory_usage(deep=True)

Index               128
sl_no              1720
gender              439
ssc_p              1720
ssc_b             13661
hsc_p              1720
hsc_b             13629
hsc_s             13840
degree_p           1720
degree_t          14098
workex            12759
etest_p            1720
specialisation    13665
mba_p              1720
status            13813
salary             1720
dtype: int64

##### OBS: Ao alterar o tipo de dado para `category`, o consumo de memória da coluna `gender` caiu de 12.470 para 439 bytes. O mesmo procedimento será aplicado às demais colunas de texto.

In [11]:
# Names of the columns still stored with the generic object dtype
cat_cols = [name for name, kind in dt.dtypes.items() if kind == 'O']
cat_cols

['ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']

In [12]:
# Print the distinct values of each object column to confirm they are low-cardinality
for col in cat_cols:
    print(dt[col].unique())

['Others' 'Central']
['Others' 'Central']
['Commerce' 'Science' 'Arts']
['Sci&Tech' 'Comm&Mgmt' 'Others']
['No' 'Yes']
['Mkt&HR' 'Mkt&Fin']
['Placed' 'Not Placed']


In [13]:
# Convert every column listed in cat_cols to the category dtype in one assignment
dt[cat_cols] = dt[cat_cols].astype('category')

In [14]:
# Memory usage after converting the remaining object columns to category
dt.memory_usage(deep=True)

Index              128
sl_no             1720
gender             439
ssc_p             1720
ssc_b              450
hsc_p             1720
hsc_b              450
hsc_s              513
degree_p          1720
degree_t           517
workex             442
etest_p           1720
specialisation     450
mba_p             1720
status             453
salary            1720
dtype: int64

#### O consumo de memória melhorou bastante

#### Analisando duplicados e valores ausentes

In [15]:
# Count duplicated rows, ignoring `sl_no`: it is a per-row serial number, so
# leaving it in the comparison would make every row unique and the check
# could never report a duplicate.
dt.drop(columns='sl_no').duplicated().sum()

0

In [16]:
# Number of missing values in each column
dt.isna().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64