### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBRegressor

### Import Training and Testing datasets

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

### Removing Columns with Zero Variance

In [3]:
remove_columns = []
for column in df_train.columns:
    if df_train[column].dtype != 'object':
        if df_train[column].var() == 0:
            remove_columns.append(column)

# rzv stands for removed zero variance
df_train_rzv = df_train.drop(remove_columns,axis=1)   
df_test_rzv = df_test.drop(remove_columns,axis=1)    

### Checking for Null Values for Test and Train Sets

In [4]:
# function to check for null values
def check_null_values(pandas_df):
    column_count = 0
    for column in pandas_df.columns:
        if pandas_df[column].isnull().any() == True:
            print(column)
        else:
            column_count+1
            
    return column_count
            
# check null values for training data
print(check_null_values(df_train_rzv),'columns have null values for training data.')

# check null values for test data
print(check_null_values(df_test_rzv),'columns have null values for test data.')

# print(column_count,'columns have null values.')

0 columns have null values for training data.
0 columns have null values for test data.


### Checking for Unique Values for Train and Test Sets

In [5]:
# function to check for unique values
def check_unique_values(pandas_df):
    unique_values = []
    unique_values_count = []
    data_type = []
    df_index = []
    for column in pandas_df.columns:
        unique_values.append(str(pandas_df[column].unique()))
        unique_values_count.append(pandas_df[column].nunique())
        df_index.append(column)
        data_type.append(pandas_df[column].dtype)

    df_unique_values = pd.DataFrame(unique_values,index = df_index)
    df_unique_values.columns = ['Unique Values']
    df_unique_values['Unique Value Count'] = unique_values_count
    df_unique_values['Data Type'] = data_type
    df_unique_values.sort_values(by = 'Unique Value Count', ascending = False)
    
    return df_unique_values

In [6]:
# unique classes of training data
df_train_unique_values = check_unique_values(df_train_rzv)
print(df_train_unique_values.head(10))

                                        Unique Values  Unique Value Count  \
ID                [   0    6    7 ... 8412 8415 8417]                4209   
y     [130.81  88.53  76.26 ...  85.71 108.77  87.48]                2545   
X0  ['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay...                  47   
X1  ['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' ...                  27   
X2  ['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm'...                  44   
X3                      ['a' 'e' 'c' 'f' 'd' 'b' 'g']                   7   
X4                                  ['d' 'b' 'c' 'a']                   4   
X5  ['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' ...                  29   
X6  ['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']                  12   
X8  ['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' '...                  25   

   Data Type  
ID     int64  
y    float64  
X0    object  
X1    object  
X2    object  
X3    object  
X4    object  
X5    object  
X6    object  
X8

In [7]:
# unique classes of testing data
df_test_unique_values = check_unique_values(df_test_rzv)
print(df_test_unique_values.head(10))

                                         Unique Values  Unique Value Count  \
ID                 [   1    2    3 ... 8413 8414 8416]                4209   
X0   ['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' '...                  49   
X1   ['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' ...                  27   
X2   ['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' '...                  45   
X3                       ['f' 'a' 'c' 'e' 'd' 'g' 'b']                   7   
X4                                   ['d' 'b' 'a' 'c']                   4   
X5   ['t' 'b' 'a' 'z' 'y' 'x' 'h' 'g' 'f' 'j' 'i' '...                  32   
X6   ['a' 'g' 'j' 'l' 'i' 'd' 'f' 'h' 'c' 'k' 'e' 'b']                  12   
X8   ['w' 'y' 'j' 'n' 'm' 's' 'a' 'v' 'r' 'o' 't' '...                  25   
X10                                              [0 1]                   2   

    Data Type  
ID      int64  
X0     object  
X1     object  
X2     object  
X3     object  
X4     object  
X5     object  
X6     object

In [8]:
df_train_dtypes = pd.DataFrame({'data_types': df_train_rzv.dtypes, 'count data_types': 1})
df_train_dtypes.groupby(by = 'data_types').sum()

Unnamed: 0_level_0,count data_types
data_types,Unnamed: 1_level_1
int64,357
float64,1
object,8


In [9]:
# checking unique classes for only object data type
df_test_unique_values[df_test_unique_values['Data Type'] == 'object']

Unnamed: 0,Unique Values,Unique Value Count,Data Type
X0,['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' '...,49,object
X1,['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' ...,27,object
X2,['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' '...,45,object
X3,['f' 'a' 'c' 'e' 'd' 'g' 'b'],7,object
X4,['d' 'b' 'a' 'c'],4,object
X5,['t' 'b' 'a' 'z' 'y' 'x' 'h' 'g' 'f' 'j' 'i' '...,32,object
X6,['a' 'g' 'j' 'l' 'i' 'd' 'f' 'h' 'c' 'k' 'e' 'b'],12,object
X8,['w' 'y' 'j' 'n' 'm' 's' 'a' 'v' 'r' 'o' 't' '...,25,object


### Identify the Output Variable

In [10]:
for column in df_train.columns:
    if column not in df_test.columns:
        y_train_column = column

In [11]:
X_train = df_train_rzv.drop('y',axis=1)
y_train = df_train_rzv[y_train_column]

In [12]:
# checking for unique data types in X_train
print(X_train.dtypes.unique())

[dtype('int64') dtype('O')]


In [13]:
# extracting object data type column names 
X_train_object_columns = []
for i in X_train.columns:
    if X_train[i].dtype == 'object':
        X_train_object_columns.append(i)
print(X_train_object_columns)      

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


### Checking for Null Values Sum

In [14]:
# total sum of null values
X_train.isna().sum().sum()

0

### Encoding Categorical Variables

In [15]:
le = LabelEncoder()

# combine train and test data before applying label encoder
combined_df = pd.concat([X_train,df_test_rzv],axis = 0)

for column_name in X_train_object_columns:
    combined_df[column_name] = le.fit_transform(combined_df[column_name])
    
# split back into X_train and X_test
X_train = combined_df.iloc[:X_train.shape[0],:]
X_test = combined_df.iloc[X_train.shape[0]:,:]

### Standardizing Data

In [16]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

### Applying PCA (Dimensionality Reduction)

In [17]:
pca = PCA(n_components = 175)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
print(pca.explained_variance_ratio_.sum())

0.9773469401524962


### xgboost Regressor Model

In [18]:
# create xgboost estimator
xgb = XGBRegressor(max_depth = 2)

# fit xgboost training data to model
xgb.fit(X_train,y_train)

# predict xgboost testing data
y_pred = xgb.predict(X_test)