In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training data from the CSV file (relative path "train.csv")
data = pd.read_csv("train.csv")
# Preview the first rows to sanity-check the load
data.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature40,feature41,feature42,feature43,feature44,feature45,feature46,feature47,feature48,targetfeature
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Gtl,7,...,0,0,0,0,2,2008,,,,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,Gtl,6,...,0,0,0,0,5,2007,,,,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Gtl,7,...,0,0,0,0,9,2008,,,,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Gtl,7,...,0,0,0,0,2,2006,,,,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,Gtl,8,...,0,0,0,0,12,2008,,,,250000


## 1. Delete the features having more than 30% of missing data

In [3]:
# Before deleting anything, count the missing values in every column.
# isna() is the same check as isnull(); summing the boolean mask gives the per-column count.
data.isna().sum()

feature1            0
feature2            0
feature3          192
feature4            0
feature5            0
feature6            0
feature7            0
feature8            0
feature9            0
feature10           0
feature11           0
feature12           0
feature13           6
feature14           0
feature15           0
feature16           0
feature17           0
feature18           0
feature19           0
feature20           0
feature21           0
feature22           0
feature23           0
feature24           0
feature25           0
feature26           0
feature27           0
feature28           0
feature29           0
feature30           0
feature31           0
feature32         524
feature33          61
feature34           0
feature35           0
feature36           0
feature37           0
feature38           0
feature39           0
feature40           0
feature41           0
feature42           0
feature43           0
feature44           0
feature45           0
feature46 

In [4]:
# (rows, columns) of the dataset at this point in the notebook
data.shape

(1099, 49)

In [5]:
# isnull().mean() gives the fraction of missing values per column;
# keep the names of the columns where that fraction exceeds 30%
missing_fraction = data.isnull().mean()
cols = data.columns[missing_fraction > 0.3]
cols

Index(['feature32', 'feature46', 'feature47', 'feature48'], dtype='object')

In [6]:
# Drop the columns listed in `cols` (those with more than 30% missing values).
# Rebinding `data` instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
data = data.drop(columns=cols, errors="ignore")

In [7]:
# Re-check the shape after the column drop: only the column count should have changed
data.shape

(1099, 45)

## 2. Take the data set containing only numerical values i.e float and int

In [8]:
# info() lists every column's dtype and non-null count,
# which shows which columns are numeric and which are object (text)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 45 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   feature1       1099 non-null   int64  
 1   feature2       1099 non-null   object 
 2   feature3       907 non-null    float64
 3   feature4       1099 non-null   int64  
 4   feature5       1099 non-null   object 
 5   feature6       1099 non-null   object 
 6   feature7       1099 non-null   object 
 7   feature8       1099 non-null   object 
 8   feature9       1099 non-null   object 
 9   feature10      1099 non-null   int64  
 10  feature11      1099 non-null   int64  
 11  feature12      1099 non-null   int64  
 12  feature13      1093 non-null   float64
 13  feature14      1099 non-null   object 
 14  feature15      1099 non-null   object 
 15  feature16      1099 non-null   int64  
 16  feature17      1099 non-null   int64  
 17  feature18      1099 non-null   int64  
 18  feature1

#### Here you can see there are many columns with object data type
* We cannot directly drop these columns
     * If we drop them, we might lose important features that have a significant effect on the target. Instead, for each column of this type we should check whether it has any relation with the target, and only then decide whether to drop it.
     * Example: Imagine you are working on a Flipkart dataset and trying to predict mobile phone sales, and there are 5-6 different brands in the dataset. If you deleted the brand column as soon as you saw it, just because it was object-type data, what would you predict now? Instead, in this scenario you would convert the mobile brands into labels so that the system can understand and categorize this data before predicting.
* Nor can we directly convert these data sets
     * If we blindly convert every object column, it will affect system performance, because each unique value in a column is assigned its own unique ID, which is not always needed.
     * Example: If you convert a date column covering one whole year this way, Python will create 365 unique IDs, which uses extra memory and slows the system down, and the resulting IDs are useless because we cannot work with them. If we instead convert the column to a datetime type, we can still use it without freezing the system.
     
#### But here we are asked to drop these columns with object type of data

In [9]:
# select_dtypes() filters columns by dtype: excluding 'object' removes the text columns
# and returns the remaining (non-object) columns as a new DataFrame
df = data.select_dtypes(exclude='object')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   feature1       1099 non-null   int64  
 1   feature3       907 non-null    float64
 2   feature4       1099 non-null   int64  
 3   feature10      1099 non-null   int64  
 4   feature11      1099 non-null   int64  
 5   feature12      1099 non-null   int64  
 6   feature13      1093 non-null   float64
 7   feature16      1099 non-null   int64  
 8   feature17      1099 non-null   int64  
 9   feature18      1099 non-null   int64  
 10  feature21      1099 non-null   int64  
 11  feature22      1099 non-null   int64  
 12  feature23      1099 non-null   int64  
 13  feature24      1099 non-null   int64  
 14  feature25      1099 non-null   int64  
 15  feature26      1099 non-null   int64  
 16  feature27      1099 non-null   int64  
 17  feature29      1099 non-null   int64  
 18  feature3

## 3. Find which feature is most strongly correlated with the "targetfeature"

* Correlation can be:
    * Positive Correlation: both variables change in the same direction.
    * Neutral Correlation: No relationship in the change of the variables.
    * Negative Correlation: variables change in opposite directions.


In [10]:
# Before picking a feature, compute the pairwise correlation between all columns of df
# (this includes each feature's correlation with targetfeature)
df.corr()

Unnamed: 0,feature1,feature3,feature4,feature10,feature11,feature12,feature13,feature16,feature17,feature18,...,feature37,feature38,feature39,feature40,feature41,feature42,feature43,feature44,feature45,targetfeature
feature1,1.0,-0.411931,-0.128634,0.03927,-0.051837,0.029854,0.008712,-0.06584,-0.145217,-0.238489,...,-0.01353,0.003313,-0.011505,-0.0443,-0.030847,-0.012911,-0.020653,0.02429,-0.034821,-0.085574
feature3,-0.411931,1.0,0.388552,0.244048,-0.062015,0.115347,0.189894,0.127721,0.163152,0.326199,...,0.079752,0.140113,0.043092,0.079795,0.055915,0.102612,0.007149,0.015416,0.031762,0.364457
feature4,-0.128634,0.388552,1.0,0.097665,0.002873,0.001255,0.079408,0.188044,-0.008621,0.231442,...,0.173241,0.06461,-0.012484,0.013421,0.043595,0.024397,0.032453,0.011589,-0.011789,0.264546
feature10,0.03927,0.244048,0.097665,1.0,-0.082454,0.569298,0.405603,0.221423,0.324043,0.548815,...,0.237564,0.318582,-0.099835,0.016125,0.086232,0.023575,-0.0341,0.094632,-0.033363,0.796978
feature11,-0.051837,-0.062015,0.002873,-0.082454,1.0,-0.386266,-0.125753,-0.039241,-0.134059,-0.165907,...,0.015668,-0.019749,0.059001,0.015113,0.054854,-0.01718,0.083139,-0.013203,0.028341,-0.081264
feature12,0.029854,0.115347,0.001255,0.569298,-0.386266,1.0,0.316297,0.243144,0.158148,0.399804,...,0.224688,0.181129,-0.376025,0.027614,-0.032875,-0.031791,-0.041583,0.027684,0.013094,0.528665
feature13,0.008712,0.189894,0.079408,0.405603,-0.125753,0.316297,1.0,0.247053,0.107331,0.344003,...,0.163029,0.106431,-0.093626,0.011753,0.079961,-0.011802,-0.026725,-0.005676,-0.001658,0.481259
feature16,-0.06584,0.127721,0.188044,0.221423,-0.039241,0.243144,0.247053,1.0,-0.516091,0.473951,...,0.221892,0.072086,-0.093458,0.003826,0.063286,0.037358,0.001282,-0.016319,0.018243,0.39199
feature17,-0.145217,0.163152,-0.008621,0.324043,-0.134059,0.158148,0.107331,-0.516091,1.0,0.438902,...,-0.00715,0.129023,0.009574,0.018423,-0.000814,-0.047965,-0.035751,0.03858,-0.051567,0.225381
feature18,-0.238489,0.326199,0.231442,0.548815,-0.165907,0.399804,0.344003,0.473951,0.438902,1.0,...,0.249244,0.212929,-0.075349,0.010329,0.102715,0.012587,-0.042308,0.017602,-0.025002,0.640284


#### As we can see, the table is only numbers, and it is very difficult to pick out the important values just by looking at them. Hence we'll colour-code the matrix so that we can easily identify the highly correlated variables

In [11]:
# Keep the correlation matrix in `corr` and render it as a colour-coded table:
# the 'coolwarm' colormap shades each cell by its value so strong correlations stand out
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,feature1,feature3,feature4,feature10,feature11,feature12,feature13,feature16,feature17,feature18,feature21,feature22,feature23,feature24,feature25,feature26,feature27,feature29,feature31,feature33,feature34,feature35,feature37,feature38,feature39,feature40,feature41,feature42,feature43,feature44,feature45,targetfeature
feature1,1.0,-0.411931,-0.128634,0.03927,-0.051837,0.029854,0.008712,-0.06584,-0.145217,-0.238489,0.069605,0.023548,-0.017216,0.128697,0.166301,-0.018264,0.274601,0.028836,-0.033591,0.086149,-0.064743,-0.127466,-0.01353,0.003313,-0.011505,-0.0443,-0.030847,-0.012911,-0.020653,0.02429,-0.034821,-0.085574
feature3,-0.411931,1.0,0.388552,0.244048,-0.062015,0.115347,0.189894,0.127721,0.163152,0.326199,0.36974,0.077677,0.004052,0.194293,0.060821,0.291774,-0.013432,0.35008,0.243616,0.065807,0.324353,0.355955,0.079752,0.140113,0.043092,0.079795,0.055915,0.102612,0.007149,0.015416,0.031762,0.364457
feature4,-0.128634,0.388552,1.0,0.097665,0.002873,0.001255,0.079408,0.188044,-0.008621,0.231442,0.234044,0.15525,0.054739,0.130936,-0.006142,0.106587,-0.021588,0.168032,0.263459,-0.036975,0.141295,0.151589,0.173241,0.06461,-0.012484,0.013421,0.043595,0.024397,0.032453,0.011589,-0.011789,0.264546
feature10,0.03927,0.244048,0.097665,1.0,-0.082454,0.569298,0.405603,0.221423,0.324043,0.548815,0.603825,0.103583,-0.044103,0.551418,0.29239,0.105033,-0.186655,0.435735,0.391297,0.547004,0.627801,0.578508,0.237564,0.318582,-0.099835,0.016125,0.086232,0.023575,-0.0341,0.094632,-0.033363,0.796978
feature11,-0.051837,-0.062015,0.002873,-0.082454,1.0,-0.386266,-0.125753,-0.039241,-0.134059,-0.165907,-0.078568,-0.044501,0.097218,-0.188255,-0.041568,0.020448,-0.078326,-0.056994,-0.02886,-0.319665,-0.179241,-0.142185,0.015668,-0.019749,0.059001,0.015113,0.054854,-0.01718,0.083139,-0.013203,0.028341,-0.081264
feature12,0.029854,0.115347,0.001255,0.569298,-0.386266,1.0,0.316297,0.243144,0.158148,0.399804,0.215855,0.166186,-0.047261,0.474267,0.253283,-0.059272,-0.177116,0.108467,0.155987,0.835134,0.542453,0.489334,0.224688,0.181129,-0.376025,0.027614,-0.032875,-0.031791,-0.041583,0.027684,0.013094,0.528665
feature13,0.008712,0.189894,0.079408,0.405603,-0.125753,0.316297,1.0,0.247053,0.107331,0.344003,0.389709,0.063676,0.025208,0.272564,0.206816,0.112992,-0.047733,0.28744,0.265579,0.252262,0.365192,0.358648,0.163029,0.106431,-0.093626,0.011753,0.079961,-0.011802,-0.026725,-0.005676,-0.001658,0.481259
feature16,-0.06584,0.127721,0.188044,0.221423,-0.039241,0.243144,0.247053,1.0,-0.516091,0.473951,0.170218,0.658365,0.061983,0.047464,0.015542,-0.098903,-0.069804,0.051121,0.249815,0.145099,0.227966,0.26601,0.221892,0.072086,-0.093458,0.003826,0.063286,0.037358,0.001282,-0.016319,0.018243,0.39199
feature17,-0.145217,0.163152,-0.008621,0.324043,-0.134059,0.158148,0.107331,-0.516091,1.0,0.438902,0.240896,-0.428845,-0.091386,0.29923,-0.054437,0.158393,0.023286,0.239462,0.041808,0.201268,0.206073,0.187928,-0.00715,0.129023,0.009574,0.018423,-0.000814,-0.047965,-0.035751,0.03858,-0.051567,0.225381
feature18,-0.238489,0.326199,0.231442,0.548815,-0.165907,0.399804,0.344003,0.473951,0.438902,1.0,0.424152,0.293757,0.008099,0.334301,-0.059678,0.055035,-0.063388,0.282115,0.321725,0.335208,0.446782,0.474879,0.249244,0.212929,-0.075349,0.010329,0.102715,0.012587,-0.042308,0.017602,-0.025002,0.640284


#### Here the value of the correlation coefficient (r) tells us the strength and the direction of the relationship between each variable and the target variable: r = 0 means neutral (no) correlation, r > 0 means positive correlation, and r < 0 means negative correlation. The closer |r| is to 1, the stronger the relationship.
* We have feature 10 with correlation of 0.79
* We have feature 18 with correlation of 0.64
* We have feature 21 with correlation of 0.72
* We have feature 34 with correlation of 0.65
* We have feature 35 with correlation of 0.63

### The strongest correlated variable is feature 10 with correlation of 0.79

## Thank you