In [1]:
import datetime
import numpy as np
import pandas as pd

In [2]:
data_types = {
    "sexo":str,
    "ind_nuevo":str,
    "ult_fec_cli_1t":str,
    "indext":str,
    'ncodpers': np.int32, 
    'indrel_1mes': 'object', 
    'conyuemp': 'object', 
    'ind_ahor_fin_ult1': np.int8, 
    'ind_aval_fin_ult1': np.int8, 
    'ind_cco_fin_ult1': np.int8, 
    'ind_cder_fin_ult1': np.int8, 
    'ind_cno_fin_ult1': np.int8, 
    'ind_ctju_fin_ult1': np.int8, 
    'ind_ctma_fin_ult1': np.int8, 
    'ind_ctop_fin_ult1': np.int8, 
    'ind_ctpp_fin_ult1': np.int8, 
    'ind_deco_fin_ult1': np.int8, 
    'ind_deme_fin_ult1': np.int8, 
    'ind_dela_fin_ult1': np.int8, 
    'ind_ecue_fin_ult1': np.int8, 
    'ind_fond_fin_ult1': np.int8, 
    'ind_hip_fin_ult1': np.int8, 
    'ind_plan_fin_ult1': np.int8, 
    'ind_pres_fin_ult1': np.int8, 
    'ind_reca_fin_ult1': np.int8, 
    'ind_tjcr_fin_ult1': np.int8, 
    'ind_valo_fin_ult1': np.int8, 
    'ind_viv_fin_ult1': np.int8, 
    'ind_recibo_ult1': np.int8
}

dates = ['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t']
nas = ['NA', ' NA', '     NA', 'NaN', '         NA']

In [3]:
data = pd.read_csv('data/test_ver2.csv', parse_dates=dates, dtype=data_types, na_values=nas)

In [5]:
data.indfall.unique()

array(['N', 'S'], dtype=object)

In [4]:
data.loc[data.age < 18,"age"] = data.loc[(data.age >= 18) & (data.age <= 30),"age"].mean(skipna=True)
data.loc[data.age > 100,"age"] = data.loc[(data.age >= 30) & (data.age <= 100),"age"].mean(skipna=True)
data["age"].fillna(data["age"].mean(),inplace=True)
data["age"] = data["age"].astype(int)

In [5]:
data.loc[data["ind_nuevo"].isnull(),"ind_nuevo"] = 1

In [6]:
data.loc[data.antiguedad.isnull(), "antiguedad"] = data.antiguedad.min()
data.loc[data.antiguedad < 0, "antiguedad"] = 0

In [7]:
dates = data.loc[:,"fecha_alta"].sort_values().reset_index()
median_date = int(np.median(dates.index.values))
data.loc[data.fecha_alta.isnull(),"fecha_alta"] = dates.loc[median_date,"fecha_alta"]

In [8]:
data.loc[data.indrel.isnull(),"indrel"] = 1

In [9]:
data.drop(["tipodom","cod_prov"],axis=1,inplace=True)

In [10]:
data.loc[data.ind_actividad_cliente.isnull(),"ind_actividad_cliente"] = data["ind_actividad_cliente"].median()

In [11]:
data.loc[data.nomprov=="CORU\xc3\x91A, A","nomprov"] = "CORUNA, A"

In [12]:
data.loc[data.nomprov.isnull(),"nomprov"] = "UNKNOWN"

In [13]:
grouped        = data.groupby("nomprov").agg({"renta":lambda x: x.median(skipna=True)}).reset_index()
new_incomes    = pd.merge(data,grouped,how="inner",on="nomprov").loc[:, ["nomprov","renta_y"]]
new_incomes    = new_incomes.rename(columns={"renta_y":"renta"}).sort_values("renta").sort_values("nomprov")
data.sort_values("nomprov",inplace=True)
data             = data.reset_index()
new_incomes    = new_incomes.reset_index()

In [14]:
data.loc[data.renta.isnull(),"renta"] = new_incomes.loc[data.renta.isnull(),"renta"].reset_index()
data.loc[data.renta.isnull(),"renta"] = data.loc[data.renta.notnull(),"renta"].median()
data.sort_values(by="fecha_dato",inplace=True)

In [15]:
data.loc[data.ind_nomina_ult1.isnull(), "ind_nomina_ult1"] = 0
data.loc[data.ind_nom_pens_ult1.isnull(), "ind_nom_pens_ult1"] = 0

AttributeError: 'DataFrame' object has no attribute 'ind_nomina_ult1'

In [18]:
string_data = data.select_dtypes(include=["object"])
missing_columns = [col for col in string_data if string_data[col].isnull().any()]

In [19]:
data.loc[data.indfall.isnull(),"indfall"] = "N"
data.loc[data.tiprel_1mes.isnull(),"tiprel_1mes"] = "A"
data.tiprel_1mes = data.tiprel_1mes.astype("category")

# As suggested by @StephenSmith
map_dict = { 1.0  : "1",
            "1.0" : "1",
            "1"   : "1",
            "3.0" : "3",
            "P"   : "P",
            3.0   : "3",
            2.0   : "2",
            "3"   : "3",
            "2.0" : "2",
            "4.0" : "4",
            "4"   : "4",
            "2"   : "2"}

data.indrel_1mes.fillna("P",inplace=True)
data.indrel_1mes = data.indrel_1mes.apply(lambda x: map_dict.get(x,x))
data.indrel_1mes = data.indrel_1mes.astype("category")


unknown_cols = [col for col in missing_columns if col not in ["indfall","tiprel_1mes","indrel_1mes"]]
for col in unknown_cols:
    data.loc[data[col].isnull(),col] = "UNKNOWN"

In [20]:
data.isnull().any()

index                    False
fecha_dato               False
ncodpers                 False
ind_empleado             False
pais_residencia          False
sexo                     False
age                      False
fecha_alta               False
ind_nuevo                False
antiguedad               False
indrel                   False
ult_fec_cli_1t            True
indrel_1mes              False
tiprel_1mes              False
indresi                  False
indext                   False
conyuemp                 False
canal_entrada            False
indfall                  False
nomprov                  False
ind_actividad_cliente    False
renta                    False
segmento                 False
dtype: bool

In [37]:
data.to_csv("data/cleaned_data.csv")

In [24]:
data.ult_fec_cli_1t[data.ult_fec_cli_1t.isnull()] = data.ult_fec_cli_1t[619523]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [25]:
data.to_csv("data/cleaned_test.csv")