# Identify the output variable

In [329]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [330]:
import warnings
warnings.filterwarnings('ignore')

In [331]:
train = pd.read_csv(r'Datasets\train.csv')
test = pd.read_csv(r'Datasets\test.csv')

In [332]:
print('Train shape:',train.shape)
print('Test shape:',test.shape)

Train shape: (9557, 143)
Test shape: (23856, 142)


In [333]:
for column in train.columns:
    if column not in test.columns:
        print("Our target variable is {}".format(column))

Our target variable is Target


# Understand the type of data

In [334]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 143 entries, Id to Target
dtypes: float64(8), int64(130), object(5)
memory usage: 10.4+ MB
None


In [335]:
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [336]:
# Select the columns of dtype 'object'
object_cols = train.select_dtypes(include=['object']).columns

for col in object_cols:
    print(col)


Id
idhogar
dependency
edjefe
edjefa


In [337]:
# Get the value counts of 'dependency'
print(train['dependency'].value_counts())

yes          2192
no           1747
.5           1497
2             730
1.5           713
.33333334     598
.66666669     487
8             378
.25           260
3             236
4             100
.75            98
.2             90
.40000001      84
1.3333334      84
2.5            77
5              24
1.25           18
3.5            18
.80000001      18
2.25           13
.71428573      12
1.75           11
1.2            11
.83333331      11
.22222222      11
.2857143        9
1.6666666       8
.60000002       8
6               7
.16666667       7
Name: dependency, dtype: int64


In [338]:
'''
Convert yes to 1 and no to 0 and keep the rest as is.
'''

'\nConvert yes to 1 and no to 0 and keep the rest as is.\n'

In [339]:
mapping_dict = {'yes': 1.0, 'no': 0.0}
train['dependency'] = train['dependency'].map(mapping_dict).astype(float)
train['edjefe'] = train['edjefe'].map(mapping_dict).astype(float)
train['edjefa'] = train['edjefa'].map(mapping_dict).astype(float)


In [340]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 143 entries, Id to Target
dtypes: float64(11), int64(130), object(2)
memory usage: 10.4+ MB


In [341]:
# Compute variance along the specified axis=0 and filter those equal to 0
zero_var_columns = pd.DataFrame(np.var(train, 0)).loc[lambda df: df[0] == 0]

# Print out those columns
print("Columns with variance 0:")
print(list(zero_var_columns.index))

Columns with variance 0:
['elimbasu5']


In [342]:
train.drop(['elimbasu5'], axis=1)

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.000000,0.0000,100.0000,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.000000,64.0000,144.0000,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.250000,64.0000,121.0000,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0000,121.0000,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0000,121.0000,1369,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9552,ID_d45ae367d,80000.0,0,6,0,1,1,0,,0,...,81,2116,25,81,1,1.562500,0.0625,68.0625,2116,2
9553,ID_c94744e07,80000.0,0,6,0,1,1,0,,0,...,0,4,25,81,1,1.562500,0.0625,68.0625,4,2
9554,ID_85fc658f8,80000.0,0,6,0,1,1,0,,0,...,25,2500,25,81,1,1.562500,0.0625,68.0625,2500,2
9555,ID_ced540c61,80000.0,0,6,0,1,1,0,,0,...,121,676,25,81,1,1.562500,0.0625,68.0625,676,2


# Check if there are any biases in your dataset

In [343]:
import itertools
from scipy.stats import chi2_contingency

def check_relationships(data, target, columns):
    pairs = [(target, var) for var in columns if var != target]

    alpha = 0.05

    results = []  # a list to store pairs that match the conditions

    for pair in pairs:
        var1, var2 = pair
        contingency_tab = pd.crosstab(data[var1], data[var2])
        chi2, p_value, dof, expected = chi2_contingency(contingency_tab)

        if chi2 > 0 and dof > 0 and p_value > 0:
            results.append((var1, var2, chi2, dof, p_value))

    return results  # return the results

# Call the function
results = check_relationships(train, 'Target', columns_to_check)

# Print the pairs that match the conditions
for var1, var2, chi2, dof, p_value in results:
    print(f"Chi-square test results for {var1} and {var2}:")
    print('Chi-square statistic:', chi2)
    print('Degrees of freedom:', dof)
    print('p-value:', p_value)
    print("\n")

# Example usage
columns_to_check = ['hogar_total', 'meaneduc', 'dependency', 'edjefe', 'edjefa', 'overcrowding', 'instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9','Target']

print(results)


Chi-square test results for Target and hogar_total:
Chi-square statistic: 613.1167059247191
Degrees of freedom: 36
p-value: 4.0526927255502183e-106


Chi-square test results for Target and dependency:
Chi-square statistic: 305.64882968869176
Degrees of freedom: 3
p-value: 5.958889726123204e-66


Chi-square test results for Target and edjefe:
Chi-square statistic: 9.312378000453501
Degrees of freedom: 3
p-value: 0.02541343494315061


Chi-square test results for Target and edjefa:
Chi-square statistic: 29.273115262348572
Degrees of freedom: 3
p-value: 1.9621826150263933e-06


Chi-square test results for Target and overcrowding:
Chi-square statistic: 1726.2789235231867
Degrees of freedom: 111
p-value: 9.025790323407828e-288


Chi-square test results for Target and instlevel1:
Chi-square statistic: 211.96257611805228
Degrees of freedom: 3
p-value: 1.0965177883351574e-45


Chi-square test results for Target and instlevel2:
Chi-square statistic: 255.54642390609632
Degrees of freedom: 3
p-val

# Check whether all members of the house have the same poverty level

In [344]:
# Group the dataframe by 'idhogar' and find the number of unique 'Target' values
unique_poverty_levels = train.groupby('idhogar')['Target'].nunique()

# Filter for households where the number of unique 'Target' values is greater than 1
inconsistent_households = unique_poverty_levels[unique_poverty_levels > 1]

# Print the IDs
print(inconsistent_households.index)

Index(['0172ab1d9', '03f4e5f4d', '0511912b6', '078a0b6e2', '09e25d616',
       '0f3e65c83', '0f9494d3a', '15a891635', '17fb04a62', '18832b840',
       '26b3a0f41', '288579c97', '28893b5e7', '2c9872b82', '2cb443214',
       '309fb7246', '30a70901d', '3c6973219', '3c73c107f', '3df651058',
       '3fe29a56b', '410194c8b', '417865404', '42ec8bef5', '43b9c83e5',
       '44f219a16', '46af47063', '4b6077882', '4c2dba109', '4dc11e11f',
       '4e19bd549', '50e064ee8', '513adb616', '54118d5d9', '55a662731',
       '564eab113', '594d3eb27', '5c3f7725d', '5c6f32bbc', '5e9329fc6',
       '614b48fb7', '636330516', '654ef7612', '67ad49822', '6833ac5dc',
       '694a0cbf4', '6a389f3de', '6bcf799cf', '6c543442a', '71cd52a80',
       '73d85d05d', '7ad269eef', '7c57f8237', '7e9d58c5c', '7ea6aca15',
       '80a66379b', '811a35744', '8242a51ec', '8420bcfca', '8ae3e74ca',
       '8bb6da3c1', '932287f5d', '9bbf7c6ca', 'a20ff33ba', 'a3288e6fa',
       'a94a45642', 'bcaa2e2f5', 'bcab69521', 'bd82509d1', 'be91

# House without a family head

In [345]:
# parentesco1 - indicates if this person is the head of the household.
# Group the dataframe by 'idhogar' and sum 'parentesco1' values
sum_of_heads = train.groupby('idhogar')['parentesco1'].sum()

# Filter for households where the sum of 'parentesco1' is equal to 0
households_without_head = sum_of_heads[sum_of_heads == 0]

# Get the IDs of the households without a head
households_without_head_ids = households_without_head.index

# Display the IDs
display(households_without_head_ids)

Index(['03c6bdf85', '09b195e7a', '1367ab31d', '1bc617b23', '374ca5a19',
       '61c10e099', '6b1b2405f', '896fe6d3e', 'a0812ef17', 'ad687ad89',
       'b1f4d89d7', 'bfd5067c2', 'c0c8a5013', 'd363d9183', 'f2bfa75c4'],
      dtype='object', name='idhogar')

# Set the poverty level of the members and the head of the house same in a family

In [346]:
# Identify the head of each household and their corresponding 'Target' value
head_target = train[train['parentesco1'] == 1][['idhogar', 'Target']]

# Rename the 'Target' column for merging
head_target = head_target.rename(columns = {'Target': 'Head_Target'})

# Merge the original DataFrame with the head_target DataFrame 
train = train.merge(head_target, on = 'idhogar', how = 'left')

# Set the 'Target' of all household members to be the same as the head of the household
train['Target'] = train['Head_Target']

# Drop the 'Head_Target' column as it is no longer needed
train = train.drop(columns = ['Head_Target'])

train.head()


Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4.0
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4.0
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4.0
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4.0
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4.0


# Count how many null values are existing in columns

In [347]:
missing_values =train.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

v2a1          6860
v18q1         7342
rez_esc       7928
dependency    5618
edjefe        5672
edjefa        3258
meaneduc         5
SQBmeaned        5
Target          23
dtype: int64


In [348]:

missing_cols = ['v2a1', 'v18q1', 'rez_esc', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'SQBmeaned']
train = train.dropna(axis=0, subset=missing_cols)
train.isna().sum()

Id                 0
v2a1               0
hacdor             0
rooms              0
hacapo             0
                  ..
SQBovercrowding    0
SQBdependency      0
SQBmeaned          0
agesq              0
Target             0
Length: 143, dtype: int64

# Remove null value rows of the target variable

In [349]:
train = train.dropna(subset=['Target'])

# Predict the accuracy using random forest classifier

In [350]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [351]:
x=train.drop(['Target'],axis=1)
y=train.Target
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=10)
x_train = x_train.select_dtypes(include=[np.number])  # Keeps only numerical columns


In [352]:
rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(x_train,y_train)

In [353]:
x_test = x_test.drop(['Id', 'idhogar'], axis=1)

In [354]:
predictions = rfc.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         4.0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



# Check the accuracy using a random forest with cross-validation

In [355]:
print(predictions)
print(y_test)

[4.]
418    4.0
Name: Target, dtype: float64


In [356]:
print(confusion_matrix(y_test,predictions))

[[1]]
