## Uploading files from your local file system

`files.upload` returns a dictionary of the files which were uploaded.
The dictionary is keyed by file name; each value is the content of the uploaded file.

In [0]:
# Disabled (commented-out) Colab helper: when uncommented it opens the browser
# upload dialog via `files.upload()` and prints the name and byte length of
# every uploaded file.
# from google.colab import files

# uploaded = files.upload()

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

## Import dependencies

In [0]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

try:
    from sklearn.preprocessing import Imputer, StandardScaler
except ImportError:
    # `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
    # `SimpleImputer` is its replacement and has the same default strategy
    # (column mean), so it is bound to the old name for the cells below.
    from sklearn.impute import SimpleImputer as Imputer
    from sklearn.preprocessing import StandardScaler


## Import data

In [0]:
# Read the three input CSVs (paths are relative to the working directory),
# in the same order as the names on the left-hand side.
dengue_features_train, dengue_label_train, dengue_features_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)

## Data Exploration

In [0]:
# Preview the first five rows of the training features
dengue_features_train.head(n=5)

In [0]:
# Preview the first five rows of the test features
dengue_features_test.head(n=5)

In [0]:
# Preview the first five rows of the training labels
dengue_label_train.head(n=5)

In [0]:
# Print a per-column summary of the training features
# (column names, non-null counts and dtypes) to spot missing values early
dengue_features_train.info()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dengue_features_train.describe()

In [0]:
# Summary statistics (count, unique, top, freq) of the object-dtype
# (non-numeric) columns; 'O' selects the object dtype
dengue_features_train.describe(include= 'O')

In [0]:
# Print the (rows, columns) shape of the train and test feature tables
print("Train data shape: ", dengue_features_train.shape)
print("Test data shape: ", dengue_features_test.shape)

## Feature Engineering

In [0]:
# Model inputs: every training column except `week_start_date`
features_to_use = dengue_features_train.columns.drop(['week_start_date'])

# Feature matrix
X = dengue_features_train[features_to_use]

# Target vector: the `total_cases` column of the label table.
# NOTE(review): this relies on the label rows being in the same order as the
# feature rows -- nothing here checks it; confirm against the data files.
y = dengue_label_train['total_cases']

In [0]:
# Number of target values
y.shape[0]

In [0]:
# One-hot encode `city`; every other column is passed through unchanged
X_encoded = pd.get_dummies(data=X, columns=['city'])

In [0]:
# Names of the encoded training columns that contain at least one missing value
has_missing = X_encoded.isnull().any()
missing_cols = X_encoded.columns[has_missing].tolist()
missing_cols

In [0]:
# Fill missing values using the imputer's default strategy.
# The fitted `imputer` is kept so the same statistics can be applied to the test set.
imputer = Imputer()
imputer.fit(X_encoded)
X_encoded_imputed = imputer.transform(X_encoded)
X_encoded_imputed

## Data model

In [0]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the imputer was fit on the full training matrix before this
# split, so the validation rows contributed to the imputation statistics.
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded_imputed,
    y,
    test_size=0.2,
    random_state=1,
)

# Baseline random forest with default hyperparameters, fixed seed, all CPU cores
model_rf = RandomForestRegressor(random_state=1, n_jobs=-1)

# Train on the training split (as the last expression, this also displays the estimator)
model_rf.fit(X_train, y_train)


In [0]:
# Predictions for the held-out validation rows
y_pred = model_rf.predict(X_val)

# Validation score: mean absolute error between true and predicted values
mean_absolute_error(y_true=y_val, y_pred=y_pred)

In [0]:
# Refit the same configuration on the complete training set (no hold-out)
# so the submission model sees every labelled row
final_model = RandomForestRegressor(n_jobs=-1, random_state=1)
final_model.fit(X_encoded_imputed, y)

In [0]:
# Build the test feature matrix with the same steps used for the training data
X_test = dengue_features_test[features_to_use]
X_test_encoded = pd.get_dummies(X_test, columns=['city'])

# One-hot encoding the test set on its own only creates dummy columns for the
# cities that occur in the test file. Re-align to the training layout so the
# imputer and the model receive exactly the columns, in exactly the order,
# they were fit on: dummies absent from the test set are added as all-zero
# columns and dummies unseen during training are dropped.
X_test_encoded = X_test_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Reuse the imputer fitted on the training data (no refitting on test data)
X_test_encoded_imputed = imputer.transform(X_test_encoded)

In [0]:
# Predict `total_cases` for every test row with the model trained on all training data
y_final = final_model.predict(X=X_test_encoded_imputed)

In [0]:
# Assemble the submission table: the identifying columns of each test row
# plus the predicted number of cases, in this column order.
output = pd.DataFrame({"city": dengue_features_test["city"],
                       "year": dengue_features_test["year"],
                       "weekofyear": dengue_features_test["weekofyear"],
                       "total_cases": y_final})
output.to_csv("submission_rf.csv", index=False)

# `files` comes from google.colab, whose import at the top of the notebook is
# commented out, so an unguarded `files.download(...)` raises NameError on a
# fresh kernel. Import it here and only trigger the browser download when the
# notebook is actually running in Colab; elsewhere the CSV is already on disk.
try:
    from google.colab import files
except ImportError:
    print('Not running in Google Colab; "submission_rf.csv" was saved to the working directory.')
else:
    files.download("submission_rf.csv")