In [112]:
import pandas as pd
# Show every column when a DataFrame is displayed (no truncation with "...")
pd.options.display.max_columns = None

In [113]:
# Load the credit dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data//credit_data.csv')

In [114]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# In the output below, `age` has a count of 1997 while the other columns have 2000,
# and its minimum is negative: the column has both missing and invalid values.
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [115]:
# Show the rows whose age is zero or negative (invalid values that must be handled)
df[df['age'].le(0)]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [64]:
# First solution - Drop the column
# Build a new frame instead of dropping in place: the cells below still read `age`
# from `df`, so an in-place drop breaks the notebook when it is run from top to bottom.
df_without_age = df.drop(columns=['age']) # Copy of `df` without the column; `df` itself is unchanged

In [73]:
# Second solution - Drop the rows with problem
# Build a filtered copy instead of mutating `df`, so this alternative does not change
# the data that the third solution and the feature extraction below work on.
# Note: rows with a missing (NaN) age are kept, because NaN <= 0 evaluates to False.
df_valid_age = df.drop(df[df['age'] <= 0].index)

In [75]:
# Third solution - Fill with the mean
# Mean of the valid (strictly positive) ages only, so the bad values do not skew it
df.loc[df['age'] > 0, 'age'].mean()

40.92770044906149

In [120]:
# Replace the invalid ages with the mean of the valid ones.
# The mean is computed here rather than hard-coded, so it stays correct if the data changes,
# and the mask uses `<= 0` to match the check used when the invalid rows were listed.
valid_age_mean = df.loc[df['age'] > 0, 'age'].mean() # Must be computed before the assignment below
df.loc[df['age'] <= 0, 'age'] = valid_age_mean # age = field that will update

In [121]:
# Show the rows whose age is missing (NaN)
df[df['age'].isna()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [125]:
# Feature matrix as a NumPy array: all rows, columns at positions 1, 2 and 3 (stop index 4 is excluded)
features = df.iloc[:, 1:4].to_numpy()

In [126]:
# Target vector as a NumPy array: all rows, only the column at position 4
target = df.iloc[:, 4].to_numpy()

In [127]:
from sklearn.impute import SimpleImputer # Fills NaN entries with a statistic computed per column
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; SimpleImputer replaces it.
# Its `missing_values` defaults to NaN and it always works column-wise (the old axis=0).
imputer = SimpleImputer(strategy='mean') # See the documentation with 'help(SimpleImputer)'
# Learn each column's mean and fill the NaN cells with it. The name is rebound rather than
# slice-assigned so this also works when `features` is a read-only view of the DataFrame.
features = imputer.fit_transform(features)
# Could be replaced in pandas by: df['age'] = df['age'].fillna(df['age'].mean())
# In that case the columns are filled one by one



In [129]:
# Scaling variables - Important for algorithms that use Euclidean distance
from sklearn.preprocessing import StandardScaler
# Standardize every feature column so that no column dominates because of its scale
scaler = StandardScaler().fit(features) # Learn the per-column statistics
features = scaler.transform(features) # Apply the learned scaling