# Income Qualification.

## Importing required libraries and dataset.

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
train=pd.read_csv("train.csv")

In [None]:
test=pd.read_csv("test.csv")

In [None]:
display(train.shape,train.head())

In [None]:
display(test.shape,test.head())

## Let us identify our target variable

In [None]:

# Any column that exists in train but not in test is reported as the target.
test_columns = set(test.columns)
for column in train.columns:
    if column in test_columns:
        continue
    print("Our Target variable is {}".format(column))

## Let's understand the types of data.

In [None]:
print(train.dtypes.value_counts())

In [None]:
print(train.info())

In [None]:
# Per-column summary of the training data: column name, number of missing
# values, dtype and number of distinct values.  The rows are collected first
# and the frame is built in one call, instead of enlarging an empty frame one
# row at a time with ``.loc`` (which also carried a no-op ``train_info``
# expression inside the loop).
train_info = pd.DataFrame(
    [
        [col, train[col].isnull().sum(), train[col].dtypes, train[col].nunique()]
        for col in train.columns
    ],
    columns=['Name of col', 'Num of Null', 'Dtype', 'N_unique'],
)

In [None]:
train_info

In [None]:
train.describe(include=['O'])

# Check If there are any Biases in the dataset 

In [None]:
train["Target"].value_counts()

In [None]:
# Bar chart of how many rows fall into each Target class.
target_counts = train["Target"].value_counts()
ax = target_counts.plot.bar(width=0.4, color='c', edgecolor='k', linewidth=1)
ax.set_xlabel("Target Values")
ax.set_ylabel("Count Of Households")
ax.set_title("Target Column Spred")
plt.show()

## Check whether all members of the house have the same poverty level 

In [None]:
# For every household id: True when all of its rows share one Target value,
# False when the rows disagree.
poverty_equal = train.groupby('idhogar')['Target'].nunique() == 1

# Keep only the households whose rows disagree on Target.
poverty_unequal = poverty_equal[~poverty_equal]

mismatch_message = " There are {} Households where all the family members of the house do not have same Povert Level."
print(mismatch_message.format(len(poverty_unequal)))


## Check if there is a house without a family head.

In [None]:
train.columns

In [None]:
# Sum of the parentesco1 column per household id.
# NOTE(review): presumably parentesco1 is a 0/1 "is head of household" flag,
# so a sum of 0 means the household has no head — confirm against the data
# dictionary.

household_head =train.groupby('idhogar')['parentesco1'].sum()

In [None]:
# Household ids whose parentesco1 sum is zero, then every training row that
# belongs to one of those households.
headless_ids = household_head[household_head == 0].index
in_headless_household = train['idhogar'].isin(headless_ids)
household_without_head = train[in_headless_household]
household_without_head['idhogar'].nunique()

### There are 15 households without a head.

## Set Poverty level of the members and the head of the House within a family 

In [None]:
# Rows whose rent column (v2a1) is not equal to 0.
# NOTE(review): NaN != 0 evaluates to True, so rows with a missing v2a1 are
# kept by this filter as well — confirm that is intended.
Poverty_level=train[train['v2a1'] !=0]

In [None]:
Poverty_level.shape

In [None]:
# Median v2a1 (rent) for each area1 group.  GroupBy.median skips missing
# values, whereas applying np.median yields NaN for any group that still
# contains a NaN rent.
poverty_level = Poverty_level.groupby('area1')['v2a1'].median()

In [None]:
poverty_level

### For rural areas, households paying rent of less than 8000 are considered below the poverty level.
### For urban areas, households paying rent of less than 140000 are considered below the poverty level.

In [None]:
def povert(x, rural_threshold=8000, urban_threshold=140000):
    """Label a rent amount relative to the rural and urban rent thresholds.

    Args:
        x: Rent amount to classify.
        rural_threshold: Rents strictly below this value are labelled as
            below the poverty level. Defaults to 8000.
        urban_threshold: Rents at or above this value are labelled as above
            the poverty level. Defaults to 140000.

    Returns:
        One of the three label strings, or ``None`` when ``x`` is NaN
        (every comparison with NaN is false, so no branch matches).
    """
    if x < rural_threshold:
        return 'Below poverty level'
    # ``>=`` rather than ``>``: a rent exactly on the urban threshold used to
    # match no branch at all and silently produced ``None``.
    if x >= urban_threshold:
        return 'Above poverty level'
    if x < urban_threshold:
        return 'Below poverty level: Ur-ban ; Above poverty level : Rural '
    # Only reachable for NaN input.
    return None
   

In [None]:
Poverty=Poverty_level['v2a1'].apply(povert)

In [None]:
Poverty.shape

In [None]:
pd.crosstab(Poverty,Poverty_level['area1']).T

## Count how many null values are Existing in columns 


In [None]:
train.isnull().sum()

In [None]:

# Refresh the per-column summary table: one row per training column holding
# its name, missing-value count, dtype and number of distinct values.
for position, column_name in enumerate(train.columns):
    column = train[column_name]
    train_info.loc[position] = [
        column_name,
        column.isnull().sum(),
        column.dtypes,
        column.nunique(),
    ]
    

In [None]:
train_info

In [None]:
train_info[train_info["Num of Null"]>0]

In [None]:
train_info["Num of Null"].sum()

## Remove null value rows of the target variable.

In [None]:
train.Target.isnull().sum()

### There are NO Null values in the Target variable.

## Treating mixed values

In [None]:
train.loc[:,["dependency","edjefe","edjefa"]].head()

In [None]:
# In both data sets, replace the words "yes"/"no" with 1/0 in the three
# mixed-value columns and cast each column to float.
mapping = {"yes": 1, "no": 0}
mixed_columns = ("dependency", "edjefe", "edjefa")

for data in [train, test]:
    for column in mixed_columns:
        data[column] = data[column].replace(mapping).astype(float)
    

In [None]:
train.loc[:,["dependency","edjefe","edjefa"]].head()

In [None]:
train[train['v2a1'].isnull()].head()

In [None]:
# Names of all training columns stored as float64.
float_col = [name for name in train.columns if train[name].dtype == 'float64']
print(float_col)

In [None]:
train[float_col].isna().sum()

In [None]:
# Replace missing v2a1 / v18q1 values with 0.  The filled column is assigned
# back to the frame: calling fillna(inplace=True) on a column selection is a
# chained assignment, which pandas deprecates and which leaves ``train``
# unchanged when Copy-on-Write is enabled.
train['v2a1'] = train['v2a1'].fillna(0)
train['v18q1'] = train['v18q1'].fillna(0)

In [None]:
# Columns removed from the training frame before modelling.
dropped_columns = ['Id', 'idhogar', 'tipovivi3', 'v18q', 'rez_esc', 'elimbasu5']
train.drop(columns=dropped_columns, inplace=True)

In [None]:
# Impute the two education columns with their own mean (Series.mean skips
# NaN).  The result is assigned back instead of using fillna(inplace=True) on
# a column selection, which pandas deprecates and which has no effect on
# ``train`` when Copy-on-Write is enabled.
train['meaneduc'] = train['meaneduc'].fillna(train['meaneduc'].mean())
train['SQBmeaned'] = train['SQBmeaned'].fillna(train['SQBmeaned'].mean())

# Distribution of per-column missing-value counts after imputation.
print(train.isna().sum().value_counts())

In [None]:
# Names of all training columns stored as int64.
int_col = [name for name in train.columns if train[name].dtype == 'int64']
print(int_col)

In [None]:
train[int_col].isna().sum().value_counts()

- There are No Null values in Dataset

## Predict the accuracy using random forest classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Separate the label column from the feature matrix.
y = train['Target']
X = train.drop(columns='Target')

In [None]:
X_col=X.columns

In [None]:

from sklearn.preprocessing import StandardScaler

# Standardise every feature and wrap the scaled array back into a DataFrame
# with the original column names and row index.  The frame must be built from
# the scaler's output: building it from ``X`` (as before) threw the scaled
# values away and left X_1 identical to the unscaled features.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — consider fitting on the
# training split only.
SS = StandardScaler()
X_1 = pd.DataFrame(SS.fit_transform(X), columns=X_col, index=X.index)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split X_1 / y into train and test parts with 25% of the rows held out
# (test_size=0.25), stratified on y, with a fixed random_state of 0.
X_train,X_test,Y_train,Y_test=train_test_split(X_1,y,test_size=0.25,stratify=y,random_state=0)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Candidate model and the hyper-parameter grid searched for it.
rfc = RandomForestClassifier(random_state=0)
parameters = {'n_estimators': [10, 50, 100, 300], 'max_depth': [3, 5, 10, 15]}
grid = zip([rfc], [parameters])

# Keep the fitted search object with the highest cross-validated score.
best_ = None

for estimator, param_grid in grid:
    search = GridSearchCV(estimator, param_grid=param_grid, cv=3, n_jobs=1)
    search.fit(X_train, Y_train)
    if best_ is None or search.best_score_ > best_.best_score_:
        best_ = search

print ("Best CV Score",best_.best_score_)
print ("Model Parameters",best_.best_params_)
print("Best Estimator",best_.best_estimator_)

In [None]:

# Take the best estimator found by the grid search, fit it on the training
# split and predict labels for the held-out split.
# NOTE(review): GridSearchCV presumably already refitted best_estimator_ on
# X_train, which would make this fit redundant — confirm.
RFC=best_.best_estimator_
Model=RFC.fit(X_train,Y_train)
pred=Model.predict(X_test)

In [None]:
# Score the fitted model on the training split and on the held-out split.
train_score = Model.score(X_train, Y_train)
test_score = Model.score(X_test, Y_test)
print('Model Score of train data : {}'.format(train_score))
print('Model Score of test data : {}'.format(test_score))

In [None]:
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

In [None]:
confusion_matrix(Y_test,pred)

In [None]:
accuracy_score(Y_test,pred)

### Conclusion :
### *Using RandomForest Classifier we can predict test_data with accuracy of 88%.*

## Check the accuracy using random forest with cross validation.

In [None]:
from sklearn.model_selection import KFold,cross_val_score

# np.random.seed() returns None, so the seed value is stored separately;
# otherwise ``seed`` would be None and anything given ``random_state=seed``
# would be unseeded.
seed = 21
np.random.seed(seed)

In [None]:
# 4-fold shuffled cross-validation of RFC, scored by accuracy; prints the
# per-fold scores.
# NOTE(review): this runs on the full X / y (not the X_train split and not
# the scaled X_1) — confirm that is intended.
kfold = KFold(n_splits=4,random_state=seed,shuffle=True)
print(cross_val_score(RFC, X, y, cv=kfold, scoring='accuracy'))

In [None]:
# Run the cross-validation again and print the mean accuracy across folds.
cv_scores = cross_val_score(RFC, X, y, cv=kfold, scoring='accuracy')
print(cv_scores.mean())

### Conclusion: Using the RandomForest Classifier with cross-validation, we can predict test_data with an accuracy of 88%.