In [None]:
%%writefile boston_house_price_prediction.py


# Step -1 - Import Packages
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn import metrics
# Default size (width, height in inches) for every matplotlib figure that
# does not pass its own figsize.
plt.rcParams["figure.figsize"] = (10, 10)



# Step - 2 - Define the main function
def main():
    """Run the California-housing regression workflow end to end.

    Loads the dataset, plots it, drops rows with missing values, scales and
    splits the data, evaluates a default ``SVR`` as the baseline, and then
    evaluates the ``MLPRegressor`` selected by grid search.
    """
    # Local import: only main() needs the dataset loader.
    from sklearn.datasets import fetch_california_housing

    california_data = fetch_california_housing()

    # Separate the features and target variable
    features = california_data.feature_names
    california_X = pd.DataFrame(california_data.data, columns=features)
    california_y = california_data.target

    # Check the data has loaded successfully
    print(california_X.shape)
    print(california_y.shape)

    ## Data Exploration
    print(f'The features in dataset are: {features}')

    # Plots
    plot_data(california_X, california_y, features, cor=True)

    ## Remove rows with missing values
    california_X, california_y = remove_outliers(california_X, california_y, features)

    X_train, y_train, X_test, y_test = preprocess(california_X, california_y, features)

    # Baseline: SVR with default hyper-parameters
    model = SVR()
    model = train(model, X_train, y_train)
    evaluate(model, X_test, y_test, bl=True)

    # optimize_models() returns an *unfitted* GridSearchCV, so the search is
    # run here; the chosen parameters only exist after fit().
    grid_search = optimize_models(X_train, y_train)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)

    # The search leaves `refit` at its default, so best_estimator_ has already
    # been refitted on the whole training set; training it again is redundant.
    best_model = grid_search.best_estimator_
    evaluate(best_model, X_test, y_test)


# Step - 3 - Plot graphs to understand data
def plot_data(x_df, y_df, features, cor=False):
    """Plot the target distribution and every feature against the target.

    Args:
        x_df: DataFrame holding the feature columns.
        y_df: Target values, one per row of ``x_df``.
        features: Names of the columns of ``x_df`` to plot.
        cor: When True, also print the features ordered by the absolute value
            of their correlation with the target (strongest first).
    """
    plt.figure(figsize=(10, 10))
    plt.title("Price Distribution")
    plt.hist(y_df, bins=30)
    plt.show()

    # squeeze=False keeps `axes` an array even when there is a single feature,
    # so flatten() below always works.
    fig, axes = plt.subplots(
        1, len(features), sharey=True, figsize=(20, 5), squeeze=False
    )
    # Use a figure-level title: plt.title() would attach to the last subplot
    # and be overwritten by that subplot's own set_title() in the loop.
    fig.suptitle("Relationship between different input features and price")
    for axis, col in zip(axes.flatten(), features):
        # Select the column by name rather than by position so the plot is
        # correct even if `features` is a subset or reordering of the columns.
        axis.scatter(x_df[col].to_numpy(), y_df, marker='o')
        axis.set_title(col)
        axis.set_xlabel(col)
        axis.set_ylabel('MEDV')
    plt.show()

    if cor:
        # Work on a copy so the caller's frame does not gain a 'Target' column.
        df = x_df[list(features)].copy()
        df['Target'] = y_df

        # Correlation of every feature with the target, strongest first.
        # The target's own entry is dropped by label instead of relying on it
        # sorting into first place.
        correlation_with_target = (
            df.corr()['Target']
            .drop('Target')
            .abs()
            .sort_values(ascending=False)
        )
        selected_features = correlation_with_target.index.tolist()

        print("Features in descending order of correlation:", selected_features)


# Step - 4 - Preprocess data
# Step -4a : Remove outliers
def remove_outliers(x, y, features):
    """Drop every row that has a missing value in the features or the target.

    Despite its name, this function only removes rows containing nulls.

    Args:
        x: DataFrame of feature columns; it is not modified.
        y: Target values, one per row of ``x``.
        features: Column labels to return from the cleaned frame.

    Returns:
        A ``(features_frame, target_series)`` pair restricted to the complete
        rows; the target series is named ``'MEDV'``.
    """
    # Join target and features so that a null in either drops the whole row.
    combined = x.assign(MEDV=y)
    complete_rows = combined.dropna()
    return complete_rows[features], complete_rows['MEDV']


# Step -4b : Normalize data
def scale_numeric(df, fit_on=None):
    """Min-max scale every column of ``df``.

    Args:
        df: DataFrame of numeric columns to transform.
        fit_on: Optional DataFrame the scaler is fitted on. When omitted the
            scaler is fitted on ``df`` itself, i.e. a plain fit-and-transform.
            Pass the training frame here when scaling a test frame so that the
            test rows do not influence the scaling statistics.

    Returns:
        A new DataFrame of the scaled values with default integer row and
        column labels.
    """
    reference = df if fit_on is None else fit_on
    # Swap in preprocessing.StandardScaler() here to observe how the choice of
    # scaler influences the results.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(reference.values)
    return pd.DataFrame(scaler.transform(df.values))


# Step -4c : Split and scale data
def preprocess(x, y, features):
    """Split the data 70/30 into train and test sets and scale the features.

    The split happens *before* scaling and the scaler is fitted on the
    training rows only; fitting it on the full dataset would leak information
    about the test rows into training.

    Args:
        x: DataFrame containing at least the columns in ``features``.
        y: Target values, one per row of ``x``.
        features: Column labels to use as model inputs.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` - note the order.
    """
    x_df = x[features].copy(deep=True)
    X_train, X_test, y_train, y_test = train_test_split(
        x_df, y, test_size=0.3, random_state=1
    )
    # Scale the test rows first, while X_train still holds the raw values the
    # scaler has to be fitted on.
    X_test = scale_numeric(X_test, fit_on=X_train)
    X_train = scale_numeric(X_train)
    return X_train, y_train, X_test, y_test


# Step - 5 - train model
def train(model, X_train, y_train):
    """Call ``model.fit`` on the training data and return the model itself.

    The object handed back is the ``model`` argument, not whatever ``fit``
    returns.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator


# Step - 6 - Evaluate Model
def evaluate(model, X_test, y_test, plot = True, print_results=True, bl=False):
    """Report regression metrics for ``model`` on the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values for ``X_test``.
        plot: When True, show a scatter plot of true versus predicted values.
        print_results: When True, print R^2, MAE, MSE and RMSE.
        bl: When True the printed header says "Baseline", otherwise "Best".

    Returns:
        None.
    """
    predicted = model.predict(X_test)

    if print_results:
        header = (
            '\n\nBaseline Model Performance on Test Dataset:\n'
            if bl
            else '\n\nBest Model Performance on Test Dataset:\n'
        )
        print(header)
        print('R^2:', metrics.r2_score(y_test, predicted))
        print('MAE:', metrics.mean_absolute_error(y_test, predicted))
        # RMSE is the square root of the MSE, so compute the MSE once.
        mse = metrics.mean_squared_error(y_test, predicted)
        print('MSE:', mse)
        print('RMSE:', np.sqrt(mse))

    if not plot:
        return

    plt.scatter(y_test, predicted)
    plt.xlabel("Prices")
    plt.ylabel("Predicted prices")
    plt.title("Prices vs Predicted prices")
    plt.show()




# Step - 7 - Improve Model
def optimize_models(X_train, y_train):
    """Build, but do not run, a grid search over ``MLPRegressor`` settings.

    ``X_train`` and ``y_train`` are accepted but not used here: the returned
    ``GridSearchCV`` is unfitted and the caller has to call ``fit`` on it.

    Returns:
        An unfitted ``GridSearchCV`` using 3-fold cross-validation and
        negative mean squared error as the score.
    """
    search_space = {
        'hidden_layer_sizes': [(50,), (100,), (100, 100)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.001, 0.01],
    }

    # Author's note: the lbfgs solver still emitted a ConvergenceWarning
    # (iteration limit reached) with max_iter raised to 1000, 2000 and 3000
    # and with different data scalers.
    regressor = MLPRegressor(random_state=42, max_iter=3000)

    return GridSearchCV(
        regressor, search_space, scoring='neg_mean_squared_error', cv=3
    )



# Call the main function only when this file is executed as a script.
if __name__ == '__main__':
    main()




Overwriting boston_house_price_prediction.py


In [None]:
%run boston_house_price_prediction.py

In [None]:
%%writefile cancer_detection.py


# Step -1 - Import Package
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn import metrics
# Default size (width, height in inches) for every matplotlib figure that
# does not pass its own figsize.
plt.rcParams["figure.figsize"] = (10, 10)



# Step - 2 - Define the main function
def main():
    """Run the breast-cancer classification workflow end to end.

    Loads the dataset, plots a handful of columns, drops rows with missing
    values, scales and splits the data, evaluates a default ``SVC`` as the
    baseline, and then evaluates an ``SVC`` built from the grid-search result.
    """
    # Get data
    cancer_data = load_breast_cancer()
    features = cancer_data.feature_names
    cancer_data_X = pd.DataFrame(cancer_data.data, columns=features)
    cancer_data_y = cancer_data.target

    # Only these columns are plotted; the models use every feature.
    plot_columns = ['mean radius', 'mean texture', 'mean area', 'mean perimeter', 'mean smoothness']

    ## Data Exploration
    print(f'The features in dataset are: {features}')

    # Plots
    plot_data(cancer_data_X, cancer_data_y, features=plot_columns, cor=True)

    ## Remove rows with missing values
    cancer_data_X, cancer_data_y = remove_outliers(cancer_data_X, cancer_data_y, features)

    X_train, y_train, X_test, y_test = preprocess(cancer_data_X, cancer_data_y, features)

    # Baseline: SVC with default hyper-parameters
    baseline_model = train(SVC(random_state=6), X_train, y_train)
    evaluate(baseline_model, X_test, y_test, bl=True)

    best_params = optimize_models(X_train, y_train)
    print(best_params)

    ## Build Best Model from the parameters the search selected
    tuned_model = SVC(
        kernel=best_params['kernel'], C=best_params['C'], random_state=6
    )
    tuned_model = train(tuned_model, X_train, y_train)
    evaluate(tuned_model, X_test, y_test)





# Step - 3 - Plot graphs to understand data
def plot_data(x_df, y_df, features, cor=False):
    """Draw a class-coloured pair plot, optionally with a correlation heatmap.

    Args:
        x_df: DataFrame holding the feature columns; it is not modified.
        y_df: Class label for each row of ``x_df``.
        features: Names of the columns of ``x_df`` to plot.
        cor: When True, also show a heatmap of the correlations between the
            columns in ``features``.
    """
    columns = list(features)

    # Work on a copy so the caller's frame does not gain a 'class' column.
    X = x_df.copy(deep=True)
    X['class'] = y_df
    # Plot the columns the caller asked for instead of a hard-coded list, so
    # the pair plot and the heatmap below always cover the same features.
    sns.pairplot(X, hue='class', vars=columns)
    plt.show()

    if cor:
        corr = X[columns].corr()
        plt.figure(figsize=(10, 10))
        sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True,
                    annot_kws={'size': 15}, cmap='Greens')
        plt.show()






# Step - 4 - Preprocess data
# Step -4a : Remove outliers
def remove_outliers(x, y, features):
    """Drop every row that has a missing value in the features or the label.

    Despite its name, this function only removes rows containing nulls.

    Args:
        x: DataFrame of feature columns; it is not modified.
        y: Class labels, one per row of ``x``.
        features: Column labels to return from the cleaned frame.

    Returns:
        A ``(features_frame, label_series)`` pair restricted to the complete
        rows; the label series is named ``'class'``.
    """
    # 'class' is a Python keyword, hence the dict unpacking.
    labelled = x.assign(**{'class': y})
    complete_rows = labelled.dropna()
    return complete_rows[features], complete_rows['class']


# Step -4b : Normalize data
def scale_numeric(df, fit_on=None):
    """Standardise every column of ``df`` with a ``StandardScaler``.

    Args:
        df: DataFrame of numeric columns to transform.
        fit_on: Optional DataFrame the scaler is fitted on. When omitted the
            scaler is fitted on ``df`` itself, i.e. a plain fit-and-transform.
            Pass the training frame here when scaling a test frame so that the
            test rows do not influence the scaling statistics.

    Returns:
        A new DataFrame of the scaled values with default integer row and
        column labels.
    """
    reference = df if fit_on is None else fit_on
    scaler = preprocessing.StandardScaler()
    scaler.fit(reference.values)
    return pd.DataFrame(scaler.transform(df.values))



# Step -4c : Split and scale data
def preprocess(x, y, features):
    """Split the data 70/30 into train and test sets and scale the features.

    The split happens *before* scaling and the scaler is fitted on the
    training rows only; fitting it on the full dataset would leak information
    about the test rows into training.

    Args:
        x: DataFrame containing at least the columns in ``features``.
        y: Class labels, one per row of ``x``.
        features: Column labels to use as model inputs.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` - note the order.
    """
    x_df = x[features].copy(deep=True)
    X_train, X_test, y_train, y_test = train_test_split(
        x_df, y, test_size=0.3, random_state=45
    )
    # Scale the test rows first, while X_train still holds the raw values the
    # scaler has to be fitted on.
    X_test = scale_numeric(X_test, fit_on=X_train)
    X_train = scale_numeric(X_train)
    return X_train, y_train, X_test, y_test




# Step - 5 - train model
def train(model, X_train, y_train):
    """Call ``model.fit`` on the training data and return the model itself.

    The object handed back is the ``model`` argument, not whatever ``fit``
    returns.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator


# Step - 6 - Evaluate Model
def evaluate(model, X_test, y_test, plot = True, print_results=True, bl=False):
    """Report classification results for ``model`` on the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True class labels for ``X_test``.
        plot: When True, show the confusion matrix as a heatmap.
        print_results: When True, print the confusion matrix and the accuracy
            as a percentage.
        bl: When True the printed header says "Baseline", otherwise "Best".

    Returns:
        None.
    """
    predicted = model.predict(X_test)
    confusion = metrics.confusion_matrix(y_test, predicted)
    acc = metrics.accuracy_score(y_test, predicted)

    if print_results:
        header = (
            '\n\nBaseline Model Performance on Test Dataset:\n'
            if bl
            else '\n\nBest Model Performance on Test Dataset:\n'
        )
        print(header)
        print('\nConfusion Matrix:\n', confusion)
        print(f'Accuracy: {acc*100}%')

    if not plot:
        return

    sns.heatmap(confusion, annot=True)
    plt.show()




# Step - 7 - Improve Model
def optimize_models(X_train, y_train):
    """Grid-search ``SVC`` on the training data and return the best settings.

    The search covers the RBF kernel with ``C`` in 1.0, 5.0 and 10, using
    ``GridSearchCV``'s default scoring and cross-validation.

    Returns:
        The ``best_params_`` dict of the fitted search, with the keys
        ``'kernel'`` and ``'C'``.
    """
    search = GridSearchCV(
        SVC(random_state=5),
        {'kernel': ['rbf'], 'C': [1.0, 5.0, 10]},
    )
    search.fit(X_train, y_train)
    return search.best_params_


# Call the main function only when this file is executed as a script.
if __name__ == '__main__':
    main()




In [None]:
%run cancer_detection.py