## Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

## Import data

In [None]:
# Load the raw file into a DataFrame. The path is left empty here (presumably a
# placeholder to be filled in with the data file's location).
# sep:   delimiter that separates the columns. pandas defaults to ','; change it
#        if the file is not comma-separated.
# names: column names to assign when the data file does not provide them.
data = pd.read_csv(
    '',
    names=['col1', 'col2', 'col3'],
    sep=' ',
)

## Inspect data

In [None]:
data.describe()
# Compare the `count` row across columns: describe() counts only non-missing values,
# so a column that has missing values shows a lower count than the other columns.

## Imputer usage

In [None]:
# Coerce the column from object to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce').
data['Bare Nuclei'] = pd.to_numeric(data['Bare Nuclei'], errors='coerce', downcast='integer')
# Fill the NaNs with the column mean. The result is assigned back rather than
# calling fillna(inplace=True) on the selected column: that chained in-place
# form emits a FutureWarning on recent pandas and leaves `data` unchanged when
# copy-on-write is enabled.
data['Bare Nuclei'] = data['Bare Nuclei'].fillna(data['Bare Nuclei'].mean())

## OneHotEncode categorical column if needed
Call the method below once for each column to be one-hot encoded, e.g. by looping over a list of column names:

In [None]:
features = []  # Names of the columns to be one-hot encoded
for feature in features:
    # encode_cat_columns returns a new DataFrame, so rebind `data` to keep the
    # result. (`data` is the frame loaded earlier; no `train` frame is defined.)
    data = encode_cat_columns(data, feature)

In [None]:
def encode_cat_columns(data, col_name):
    """Return a copy of ``data`` with ``col_name`` replaced by one-hot columns.

    The column is label-encoded first, so the new columns are named
    ``<col_name>_0``, ``<col_name>_1``, ... following the integer codes
    assigned by ``LabelEncoder``.

    Args:
        data: DataFrame containing ``col_name``. It is not modified.
        col_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``col_name`` and with the one-hot columns
        appended on the right.
    """
    codes = LabelEncoder().fit_transform(data[col_name])
    one_hot = OneHotEncoder().fit_transform(codes.reshape(-1, 1)).toarray()
    encoded = pd.DataFrame(
        one_hot,
        # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so
        # a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
        # `data` does not carry the default 0..n-1 index.
        index=data.index,
        columns=[col_name + "_" + str(i) for i in range(one_hot.shape[1])],
    )
    # Drop the original column and append its encoded replacement.
    return pd.concat([data.drop(col_name, axis=1), encoded], axis=1)

## Split data into train (70%) and test (30%)

In [None]:
# Separate the frame column-wise: every column except the last one is a
# feature (X); the last column is the result column (y).
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# reproducible, so scores from later runs can be compared with this baseline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Try a simple model to serve as a baseline accuracy value. It is improved further in later steps.

#### If classification, try DecisionTreeClassifier

In [None]:
# Baseline classifier: fit a decision tree with default settings on the
# training split (fit() returns the estimator itself), then predict the
# held-out rows.
decTree = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = decTree.predict(X_test)

#### If regression, try LinearRegression

In [None]:
# Baseline regressor: ordinary linear regression with default settings, fitted
# on the training split (fit() returns the estimator itself) and then used to
# predict the held-out rows.
linReg = LinearRegression().fit(X_train, y_train)
y_pred = linReg.predict(X_test)

#### Calculating accuracy for classification

In [None]:
# Fraction of test rows whose predicted label equals the true label,
# printed as a percentage with four decimals.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accurracy: {:.4%}'.format(acc))

#### Calculating error (MAE) for regression

In [None]:
# Mean absolute error between the true and the predicted test values.
print(
    'mean_absolute_error: {:5.4f}'.format(
        mean_absolute_error(y_true=y_test, y_pred=y_pred)
    )
)

#### Calculating recall/precision

In [None]:
# Confusion matrix of the test predictions (rows: true classes, columns:
# predicted classes). The true labels of the held-out split are `y_test`;
# no `y_true` variable is defined in this notebook.
confusion_matrix(y_test, y_pred)

## Plots

### Heatmap Analysis: Drop columns with low correlation to output column
Note: Categorical columns are not taken into account here. Visualize them separately.

In [None]:
# Correlate only the numeric/boolean columns. Recent pandas versions raise on
# non-numeric columns instead of silently skipping them, so filter explicitly.
corr = data.select_dtypes(include=['number', 'bool']).corr()
plt.subplots(figsize=(30, 30))
# NOTE: this palette is built but not passed to the heatmap below, which is
# drawn with the "RdYlBu" colormap.
cmap = sns.diverging_palette(150, 250, as_cmap=True)
# The trailing semicolon suppresses the textual repr of the returned Axes.
sns.heatmap(corr, cmap="RdYlBu", vmax=1, vmin=-0.6, center=0.2, square=True, linewidths=0, cbar_kws={"shrink": .5}, annot = True);

In [None]:
def get_features(correlation_threshold, target=None, corr_matrix=None):
    """Return the columns whose correlation with the output column is high enough.

    Masking the whole correlation matrix and taking ``.index`` would return
    every column regardless of the threshold, so only the target's column of
    the matrix is filtered here.

    Args:
        correlation_threshold: Columns are kept when the absolute value of
            their correlation with the target is strictly greater than this.
        target: Label of the output column inside the correlation matrix.
            Defaults to the matrix's last column (assumes the output column
            is the last one, as in the X/y split -- confirm for your data).
        corr_matrix: Correlation matrix to use. Defaults to the module-level
            ``corr``.

    Returns:
        A pandas Index with the labels of the retained columns. The target
        itself is included whenever the threshold is below 1.
    """
    if corr_matrix is None:
        corr_matrix = corr
    if target is None:
        target_corr = corr_matrix.iloc[:, -1]
    else:
        target_corr = corr_matrix[target]
    abs_corrs = target_corr.abs()
    # NaN correlations (e.g. constant columns) compare False and are dropped.
    return abs_corrs[abs_corrs > correlation_threshold].index

In [None]:
# Restrict `data` to the columns that get_features returns for a correlation
# threshold of 0.05; every other column is left out of the frame.
features = get_features(0.05)
data = data.loc[:, features]

### Plots

In [None]:
# Count plot: number of rows for each value of a categorical column ("Status"),
# split by the output class ("Customer Class").
sns.catplot(data=data, x="Status", hue="Customer Class", kind="count");
# Box plot: how a continuous/numerical column ("Duration (months)") varies
# across the output classes.
sns.catplot(data=data, x="Customer Class", y="Duration (months)", kind="box");

## Cross Validation

#### For regression

In [None]:
# Wrap the regressor in a single-step pipeline and cross-validate it as one estimator.
linReg = LinearRegression()
my_pipeline = Pipeline(steps=[('model', linReg)])

# sklearn reports MAE as a *negative* score, so negate the result to get
# positive error values.
scores = -cross_val_score(my_pipeline, X, y, scoring='neg_mean_absolute_error', cv=5)

print("MAE scores:\n", scores)
# One score per fold, so dividing by the number of scores gives the average.
print("Avg score:\n", sum(scores) / len(scores))

#### For classification

In [None]:
# Wrap the classifier in a single-step pipeline and cross-validate it as one estimator.
decTree = DecisionTreeClassifier()
my_pipeline = Pipeline(steps=[('model', decTree)])

# Micro-averaged recall is used as returned by sklearn; unlike the negated MAE
# scorer, it needs no sign flip.
scores = cross_val_score(my_pipeline, X, y,
                         cv=5,
                         scoring='recall_micro')

# These are recall scores, not MAE, so label them accordingly.
print("Recall (micro) scores:\n", scores)
# One score per fold, so dividing by the number of scores gives the average.
print("Avg score:\n", sum(scores) / len(scores))