**Importing and Studying the dataset**

In [None]:
#Importing necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data viz

In [None]:
# Reading the data: load the train and test CSV files from the ../input directory
# into two separate DataFrames.
training_data = pd.read_csv('../input/train.csv')
testing_data = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first 20 rows of the training frame.
training_data.head(n=20)

In [None]:
# Preview the first 20 rows of the test frame.
testing_data.head(n=20)

In [None]:
# Print the DataFrame.info() summary of the training frame.
training_data.info()

In [None]:
# Print the DataFrame.info() summary of the test frame.
testing_data.info()

<font size=5>**Exploring the target variable**</font>

In [None]:
# Plot the target values in ascending order against their rank to visualise
# how the target is distributed across the training rows.
sorted_target = np.sort(training_data['target'].values)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(range(training_data.shape[0]), sorted_target)
ax.set_xlabel('index', fontsize=12)
ax.set_ylabel('Target', fontsize=12)
ax.set_title("Target Distribution", fontsize=14)
plt.show()

We can see that most of the training data has quite a low target value, which makes sense since wealth is distributed unevenly and concentrated in the top percentiles. Kaggle uses the RMSLE metric for this competition, which helps to normalise any extreme differences from the mean.

<font size=5>**Data Preprocessing**</font>

Checking for missing data

In [None]:
# Count the nulls per column, keep only the columns that actually have missing
# values, and list them from fewest to most missing.
null_counts = training_data.isnull().sum(axis=0)
missing_df = (
    null_counts
    .rename_axis('column_name')
    .reset_index(name='missing_count')
    .query('missing_count > 0')
    .sort_values(by='missing_count')
)
print(missing_df)





Columns with only one unique value are not useful for prediction, since a constant carries no information about the target, so we'll drop them.

In [None]:
# Number of distinct values per column, smallest first, as a two-column table.
unique_counts = training_data.nunique().sort_values()
df_tmp = (
    unique_counts
    .rename_axis('Column_name')
    .reset_index(name='num_unique_values')
)
df_tmp.head()

In [None]:
def col_name_with_n_unique_value(df,n):
    """Return the names of the columns of ``df`` that have exactly ``n`` unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected (unique values are counted with
        ``DataFrame.nunique``, which ignores NaN by default).
    n : int
        Required number of distinct values in a column.

    Returns
    -------
    list
        Matching column names, ordered by the sorted unique-value counts.
        Empty when no column matches.

    Also prints how many columns matched.
    """
    unique_counts = df.nunique().sort_values()
    # Compare against ``n`` (the original hard-coded 1 here, so the argument was ignored).
    col_name = unique_counts[unique_counts == n].index.tolist()
    print('number of columns with only',n,'unique values are: ',len(col_name))
    return col_name

In [None]:
# Collect the names of the training columns that hold a single unique value.
col_to_drop=col_name_with_n_unique_value(training_data,1)

In [None]:
# Remove the single-valued columns from both frames (in place) so that train
# and test keep the same set of feature columns.
for frame in (training_data, testing_data):
    frame.drop(columns=col_to_drop, inplace=True)

print('Shape of train dataset after droping columns: ', training_data.shape)
print('Shape of test dataset after droping columns: ', testing_data.shape)

In [None]:
# Convert the frames to plain numpy arrays for the models.
# NOTE(review): the positional slices assume the first training column is an
# identifier and the second is 'target' (and that the test frame only has the
# identifier in front) - confirm against the CSV layout.
target = training_data['target'].values
train = training_data.iloc[:, 2:].values
test = testing_data.iloc[:, 1:].values

for caption, array in (
    ('Shape of train: ', train),
    ('Shape of target: ', target),
    ('Shape of test: ', test),
):
    print(caption, array.shape)

<font size=5>**Splitting our data into training and validation**</font>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=42)

# Work on log1p(target): RMSE computed on these values is the RMSLE of the raw target.
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
# The test set has no labels, so only its feature-matrix shape is reported
# (the original printed test.shape twice).
print(test.shape)

<font size=5>**Modelling**</font>

We'll train random forest, XGBoost and LightGBM models and average their predictions.

In [None]:
# Library and modules for modelling
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm

In [None]:
# Random forest baseline fitted on the log1p-transformed target.
# random_state is fixed so the reported score and the test predictions are
# reproducible across "Restart & Run All".
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
model_name = 'RandomForestRegressor'

# y_val is already log1p-transformed, so this RMSE is the RMSLE of the raw target.
RMSLE = np.sqrt(mean_squared_error(y_val, rf_model.predict(X_val)))

# Start the summary table with its first row. DataFrame.append was removed in
# pandas 2.0, so the table is built directly from a list of records instead.
Model_Summary = pd.DataFrame([{'Model_Name': model_name, 'RMSLE': RMSLE}])
print(Model_Summary)

In [None]:
# Predict on the test features, then invert the log1p transform with expm1 so
# the predictions are back on the original target scale.
rf_test_log_pred = rf_model.predict(test)
rf_pred = np.expm1(rf_test_log_pred)
print(rf_pred)

<font size=5>**XG Boost**</font>

In [None]:
# XGBoost regressor fitted on the log1p-transformed target.
xgb_model = XGBRegressor(max_depth=9)
xgb_model.fit(X_train, y_train)
model_name = 'xgboost'

# RMSE on the log1p-transformed validation target, i.e. RMSLE of the raw target.
RMSLE = np.sqrt(mean_squared_error(y_val, xgb_model.predict(X_val)))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on every
# pandas version and keeps Model_Summary a DataFrame.
Model_Summary = pd.concat(
    [Model_Summary, pd.DataFrame([{'Model_Name': model_name, 'RMSLE': RMSLE}])],
    ignore_index=True,
)
print(Model_Summary)

In [None]:
# Predict on the test features, then invert the log1p transform with expm1 so
# the predictions are back on the original target scale.
xgb_test_log_pred = xgb_model.predict(test)
xgb_pred = np.expm1(xgb_test_log_pred)
print(xgb_pred)

<font size=5>**Light GBM**</font>

In [None]:
# Wrap the training and validation splits in LightGBM Dataset objects.
train_data=lightgbm.Dataset(X_train,y_train)
valid_data=lightgbm.Dataset(X_val,y_val)
# Hyper-parameters handed to lightgbm.train below.
# NOTE(review): per the LightGBM parameter docs, 'sub_feature' and
# 'feature_fraction' look like aliases of the same setting, and so do 'min_data'
# and 'min_child_samples' - the latter pair is given two different values here
# (50 and 5). Confirm which value LightGBM actually applies and drop the
# redundant key.
# NOTE(review): 'bagging_fraction' is set without a 'bagging_freq'; presumably
# bagging is therefore inactive - verify against the LightGBM docs.
params={'learning_rate':0.01,
        'boosting_type':'gbdt',
        'objective':'regression',
        'metric':'rmse',
        'sub_feature':0.5,
        'num_leaves':180,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'min_data':50,
        'max_depth':-1,
        'reg_alpha': 0.3, 
        'reg_lambda': 0.1, 
        'min_child_weight': 10, 
        'verbose': 1,
        'nthread':5,
        'max_bin':512,
        'subsample_for_bin':200,
        'min_split_gain':0.0001,
        'min_child_samples':5
       }
# Train for at most 25000 boosting rounds while evaluating on the validation
# set; early_stopping_rounds=80 and verbose_eval=10 configure early stopping and
# how often the evaluation metric is logged.
# NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments are
# reportedly no longer accepted by lightgbm.train in LightGBM 4.x (callbacks are
# used instead) - confirm against the installed LightGBM version.
lgbm = lightgbm.train(params,
                 train_data,
                 25000,
                 valid_sets=valid_data,
                 early_stopping_rounds= 80,
                 verbose_eval= 10
                 )
model_name = 'lightgbm'

# RMSE on the log1p-transformed validation target, i.e. RMSLE of the raw target.
RMSLE = np.sqrt(mean_squared_error(y_val, lgbm.predict(X_val)))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on every
# pandas version and keeps Model_Summary a DataFrame.
Model_Summary = pd.concat(
    [Model_Summary, pd.DataFrame([{'Model_Name': model_name, 'RMSLE': RMSLE}])],
    ignore_index=True,
)
print(Model_Summary)

In [None]:
# Predict on the test features, then invert the log1p transform with expm1 so
# the predictions are back on the original target scale.
lgbm_test_log_pred = lgbm.predict(test)
lgbm_pred = np.expm1(lgbm_test_log_pred)
print(lgbm_pred)

<font size=5>**Averaging scores and getting the submission file**</font>

In [None]:
# Load the sample submission file and use it as the template for the output frame.
sub = pd.read_csv('../input/sample_submission.csv')

# Unweighted mean of the three models' test predictions (each already mapped
# back to the original scale with expm1).
# Bracket assignment is used on purpose: attribute-style assignment
# (sub.target = ...) would only set a Python attribute, not a column, if the
# 'target' column were ever missing from the template.
sub['target'] = (lgbm_pred + rf_pred + xgb_pred) / 3.0
sub.head()


In [None]:
# Write the averaged predictions to submission.csv without the DataFrame index column.
sub.to_csv('submission.csv',index=False)