# W7 - Statistics in Python.pdf

## iqr

In [2]:
import pandas as pd
import numpy as np

# Read the food-consumption table.
# Replace 'your_dataset.csv' with your actual file path or URL
food_consumption = pd.read_csv('your_dataset.csv')

# Total co2_emission per country
emissions_by_country = food_consumption.groupby('country')['co2_emission'].sum()

# Both quartiles come from a single percentile call; the IQR is their difference
q1, q3 = np.percentile(emissions_by_country, [25, 75])
iqr = q3 - q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# Keep only the countries that fall outside either fence
below_fence = emissions_by_country < lower
above_fence = emissions_by_country > upper
outliers = emissions_by_country[below_fence | above_fence]

print("Outliers:")
print(outliers)

## Calculating probabilities

In [None]:
import pandas as pd

# Read the deals table.
# Replace 'amir_deals.csv' with your actual file path or URL
amir_deals = pd.read_csv('amir_deals.csv')

# Number of deals per product
counts = amir_deals['product'].value_counts()

# Relative frequency of each product: count divided by the number of rows
probs = counts / len(amir_deals)

print("Probabilities of picking a deal for each product:")
print(probs)

## probability distribution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the restaurant groups table.
# Replace 'restaurant_groups.csv' with your actual file path or URL
restaurant_groups = pd.read_csv('restaurant_groups.csv')

# Histogram of group sizes with explicit bin edges
restaurant_groups['group_size'].hist(bins=[2, 3, 4, 5, 6])
plt.show()

# Relative frequency of each group size
size_dist = restaurant_groups['group_size'].value_counts() / len(restaurant_groups)

# Turn the Series into a two-column table: the size and its probability
size_dist = size_dist.reset_index()
size_dist.columns = ['group_size', 'prob']

print("Probability distribution of group sizes:")
print(size_dist)

# Expected value = sum over sizes of (size * probability)
expected_value = (size_dist['group_size'] * size_dist['prob']).sum()
print('The expected value is', expected_value)

# Rows for groups of 4 or more people
groups_4_or_more = size_dist.loc[size_dist['group_size'] >= 4]

# Total probability mass of those rows
prob_4_or_more = groups_4_or_more['prob'].sum()
print('The probability of groups with 4 or more people is', prob_4_or_more)


## calculate probabilities and generate random wait times:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import uniform

min_time = 0
max_time = 30

# scipy.stats.uniform is parameterised by loc (lower bound) and scale (width),
# i.e. it covers [loc, loc + scale]. Passing max_time as the scale is only
# correct while min_time == 0, so the width is computed explicitly here.
wait_range = max_time - min_time

# Calculate probability of waiting more than 5 mins
prob_greater_than_5 = 1 - uniform.cdf(5, loc=min_time, scale=wait_range)
print("Probability of waiting more than 5 mins:", prob_greater_than_5)

# Calculate probability of waiting 10-20 mins
prob_between_10_and_20 = (
    uniform.cdf(20, loc=min_time, scale=wait_range)
    - uniform.cdf(10, loc=min_time, scale=wait_range)
)
print("Probability of waiting between 10 and 20 mins:", prob_between_10_and_20)

# Set random seed for reproducibility
np.random.seed(334)

# Generate 1000 wait times between min_time and max_time
wait_times = uniform.rvs(loc=min_time, scale=wait_range, size=1000)

# Histogram of the simulated wait times, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(wait_times, bins=20)  # bin count can be tuned
ax.set_xlabel("Wait Time (mins)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Simulated Wait Times")
plt.show()


## generating sample means from a dataset and creating a histogram using a sample size of 20 and a loop of 100 iterations

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Seed NumPy's global RNG so the resampling below is repeatable
np.random.seed(104)

# 100 draws; each entry is the mean of 20 num_users values sampled with replacement.
# amir_deals is expected to be in scope from the earlier probabilities cell.
sample_means = [
    amir_deals['num_users'].sample(20, replace=True).mean()
    for _ in range(100)
]

# Wrap the means in a Series so pandas can draw the histogram
sample_means_series = pd.Series(sample_means)
sample_means_series.hist(bins=15)  # bin count can be tuned

plt.xlabel("Sample Means")
plt.ylabel("Frequency")
plt.title("Histogram of Sample Means")

plt.show()


## creating a scatterplot, adding a trendline and calculating the correlation between two variables:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the world happiness table.
# Replace 'world_happiness.csv' with your actual file path or URL
world_happiness = pd.read_csv('world_happiness.csv')

# Plain scatterplot: life expectancy on x, happiness score on y
sns.scatterplot(data=world_happiness, x='life_exp', y='happiness_score')
plt.title("Scatterplot of Happiness Score vs. Life Expectancy")
plt.xlabel("Life Expectancy")
plt.ylabel("Happiness Score")
plt.show()

# Same variables with a fitted trendline (ci=None hides the confidence band)
sns.lmplot(data=world_happiness, x='life_exp', y='happiness_score', ci=None)
plt.title("Scatterplot with Trendline of Happiness Score vs. Life Expectancy")
plt.xlabel("Life Expectancy")
plt.ylabel("Happiness Score")
plt.show()

# Correlation between the two columns
life_exp = world_happiness['life_exp']
happiness_score = world_happiness['happiness_score']
correlation = life_exp.corr(happiness_score)
print("Correlation between Life Expectancy and Happiness Score:", correlation)

# W8 - Regression with Statsmodels (1).pdf

## all from chat

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols

# Taiwan real estate dataset
# NOTE(review): the later cells read 'taiwan_real_estate2.csv' (lower-case 't').
# On a case-sensitive filesystem only one spelling can match; confirm the file name.
taiwan_real_estate = pd.read_csv('Taiwan_real_estate2.csv')

# Visualizing two numeric variables.
# The explanatory variable goes on x and the response on y so the plot matches
# the model fitted below (price_twd_msq ~ n_convenience).
sns.regplot(x="n_convenience", y="price_twd_msq", data=taiwan_real_estate, scatter_kws={'alpha': 0.5}, ci=None)
plt.show()

# Linear regression with ols()
mdl_price_vs_conv = ols('price_twd_msq ~ n_convenience', data=taiwan_real_estate).fit()
print(mdl_price_vs_conv.params)

# Predicting values
explanatory_data = pd.DataFrame({'n_convenience': np.arange(0, 11)})
price_twd_msq = mdl_price_vs_conv.predict(explanatory_data)
prediction_data = explanatory_data.assign(price_twd_msq=price_twd_msq)
print(prediction_data)

# Manually predicting values
# params is a label-indexed Series, so positional access must go through .iloc
# (plain coeffs[0] on string labels is deprecated in recent pandas).
coeffs = mdl_price_vs_conv.params
intercept = coeffs.iloc[0]
slope = coeffs.iloc[1]
price_twd_msq = intercept + slope * explanatory_data['n_convenience']
print(price_twd_msq)
# A Series has no .assign(); build a small DataFrame to compare the manual
# predictions side by side with the ones returned by .predict().
print(pd.DataFrame({
    "price_twd_msq": price_twd_msq,
    "predictions_auto": mdl_price_vs_conv.predict(explanatory_data),
}))

# Transforming variables
taiwan_real_estate["sqrt_dist_to_mrt_m"] = np.sqrt(taiwan_real_estate["dist_to_mrt_m"])

sns.regplot(x="dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
plt.show()

mdl_price_vs_dist = ols("price_twd_msq ~ sqrt_dist_to_mrt_m", data=taiwan_real_estate).fit()
print(mdl_price_vs_dist.params)

# Grid of distances (0, 10^2, ..., 80^2) together with their square roots
explanatory_data = pd.DataFrame({"sqrt_dist_to_mrt_m": np.sqrt(np.arange(0, 81, 10) ** 2), "dist_to_mrt_m": np.arange(0, 81, 10) ** 2})
prediction_data = explanatory_data.assign(price_twd_msq=mdl_price_vs_dist.predict(explanatory_data))
print(prediction_data)

# Fitted trendline on the transformed variable, with the grid predictions in red
plt.figure()
sns.regplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
sns.scatterplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=prediction_data, color="red")
plt.show()

# Quantifying Model Fit
# The original referenced mdl_bream, which is never defined in this notebook.
# The most recently fitted model is used instead; swap in mdl_price_vs_conv to
# get the same figures for the convenience-store model.
mse = mdl_price_vs_dist.mse_resid
print('mse: ', mse)

# Residual standard error is the square root of the residual mean squared error
rse = np.sqrt(mse)
print("rse: ", rse)


##  linear regression model using the ols

In [None]:
# Import the ols function
from statsmodels.formula.api import ols

# Read the Taiwan real estate table.
# Replace 'taiwan_real_estate2.csv' with your actual file path or URL
taiwan_real_estate = pd.read_csv('taiwan_real_estate2.csv')

# Specify the model with a formula and fit it in one chained expression;
# the name ends up bound to the fitted results, as before.
mdl_price_vs_conv = ols('price_twd_msq ~ n_convenience', data=taiwan_real_estate).fit()

# Show the parameters of the fitted model
print(mdl_price_vs_conv.params)

## transformed variables and linear regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols

# Load your dataset into a DataFrame
# Replace 'taiwan_real_estate2.csv' with your actual file path or URL
taiwan_real_estate = pd.read_csv('taiwan_real_estate2.csv')

# Scatter plot with trendline of the raw distance vs. price
sns.regplot(x="dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
plt.show()

# Add the square root of the distance as a new column
taiwan_real_estate["sqrt_dist_to_mrt_m"] = np.sqrt(taiwan_real_estate["dist_to_mrt_m"])

# Same plot using the transformed variable
plt.figure()
sns.regplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
plt.show()

# Regress price on the square root of the distance
mdl_price_vs_dist = ols("price_twd_msq ~ sqrt_dist_to_mrt_m", data=taiwan_real_estate).fit()

# Print the parameters of the fitted model
print(mdl_price_vs_dist.params)

# Prediction grid: distances 0, 10^2, ..., 80^2 and their square roots
explanatory_data = pd.DataFrame({"sqrt_dist_to_mrt_m": np.sqrt(np.arange(0, 81, 10) ** 2), "dist_to_mrt_m": np.arange(0, 81, 10) ** 2})

# Attach the model's predictions to the grid
prediction_data = explanatory_data.assign(price_twd_msq=mdl_price_vs_dist.predict(explanatory_data))

# Transformed scale: data, fitted trendline, and grid predictions in red.
# plt.figure() is called only for its side effect of opening a new figure;
# the returned handle was never used, so it is no longer stored.
plt.figure()
sns.regplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
sns.scatterplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=prediction_data, color="red")
plt.show()

# Original scale: data with a straight trendline, and the grid predictions in red
plt.figure()
sns.regplot(x="dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
sns.scatterplot(x="dist_to_mrt_m", y="price_twd_msq", data=prediction_data, color="red")
# Show this figure explicitly, like every other figure in the cell
plt.show()

FileNotFoundError: ignored

# W9 - Machine Learning with Scikit-learn (1).pdf

## all from GPT

In [None]:
# Machine Learning with Scikit-learn

# Supervised Machine Learning

# What is machine learning?
# Machine learning is the process whereby:
# Computers are given the ability to learn to make decisions from data without being explicitly programmed.

# Unsupervised learning
# Uncovering hidden patterns from unlabeled data
# Example:
# Grouping customers into distinct categories (Clustering) Cluster Analysis for Customer Churn

# Supervised learning
# The predicted values are known
# Aim: Predict the target values of unseen data, given the features

# Features
# Target variable points_per_game assists_per_game rebounds_per_game steals_per_game blocks_per_game position
# 26.9 6.6 4.5 11 0.4 Point Guard
# 13 1.7 4 0.4 1.3 Center
# 17.6 2.3 7.9 1.0 0.8 Power Forward
# 22.6 4.5 4.4 1.2 0.4 Shooting Guard

# Types of supervised learning
# Classification: Target variable consists of categories
# Regression: Target variable is continuous

# Classifying labels of unseen data
# 1. Build a model
# 2. Model learns from the labeled data we pass to it
# 3. Pass unlabeled data to the model as input
# 4. Model predicts the labels of the unseen data
# Labeled data = training data

# k-Nearest Neighbors
# Predict the label of a data point by
# Looking at the k closest labeled data points
# Taking a majority vote

# To build intuition for KNN, let's look at this scatter plot displaying total evening charge against total day charge for customers of a telecom company.
# The observations are colored in blue for customers who have churned, and red for those who have not churned.
# Here we have visualized the results of a KNN algorithm where the number of neighbors is set to 15.
# KNN creates a decision boundary to predict if customers will churn.
# Any customers in the area with a gray background are predicted to churn, and those in the area with a red background are predicted to not churn.
# This boundary would be used to make predictions on unseen data.

# Using scikit-learn to fit a classifier
from sklearn.neighbors import KNeighborsClassifier

# churn_df is not defined by any earlier cell, so a fresh-kernel run would stop
# here with a NameError. Load it from the file named in the exercise text below.
churn_df = pd.read_csv('churn_df.csv')

# Two-column feature matrix and the target vector as NumPy arrays
X = churn_df[["total_day_charge", "total_eve_charge"]].values
y = churn_df["churn"].values
print(X.shape, y.shape)

# Classifier that votes among the 15 nearest labelled points
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X, y)

# 1 corresponds to churn and 0 corresponds to no churn

# k-Nearest Neighbors: Fit
# In this exercise, you will build your first classification model using the churn_df dataset, can be loaded from churn_df.csv.
# The features to use will be "account_length" and "customer_service_calls".
# The target, "churn", needs to be a single column with the same number of observations as the feature data.
# You will convert the features and the target variable into NumPy arrays, create an instance of a KNN classifier, and then fit it to the data.
# Instructions
# • Import KNeighborsClassifier from sklearn.neighbors.
# • Create an array called X containing values from the "account_length" and "customer_service_calls" columns, and an array called y for the values of the "churn" column.
# • Instantiate a KNeighborsClassifier called knn with 6 neighbors.
# • Fit the classifier to the data using the .fit() method.
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix from the two requested columns, and the churn target vector
feature_cols = ["account_length", "customer_service_calls"]
X = churn_df[feature_cols].values
y = churn_df["churn"].values

# fit() returns the estimator itself, so construction and training are chained
knn = KNeighborsClassifier(n_neighbors=6).fit(X, y)

# k-Nearest Neighbors: Predict
# With the classifier fitted, it can label new data points.
# All available data was used for training, but new observations, X_new, are available.
# The knn model fitted just above is used to predict the label of each new point.
X_new = np.array([
    [30.0, 17.5],
    [107.0, 24.1],
    [213.0, 10.9],
])
y_pred = knn.predict(X_new)
print(y_pred)

# Measuring model performance
# Computing accuracy
# Training set
# Split data
# Fit/train classifier on training set
# Calculate accuracy using test set
# Test set
# It is best practice to ensure our split reflects the proportion of labels in our data.
# So if churn occurs in 10% of observations, we want 10% of labels in our training and test sets to represent churn.
# We achieve this by setting stratify equal to y.
from sklearn.model_selection import train_test_split

# 70/30 split, seeded, with class proportions preserved via stratify
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
    stratify=y,
)

# Train a 2-neighbour classifier on the training part only
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

# Accuracy on the held-out part
print(knn.score(X_test, y_test))

# Decision boundaries, underfitting, and overfitting
# Recall that we discussed decision boundaries, which are thresholds for determining what label a model assigns to an observation.
# In the image shown, as k increases, the decision boundary is less affected by individual observations, reflecting a simpler model.
# Simpler models are less able to detect relationships in the dataset, which is known as underfitting.
# Complex models can be sensitive to noise in the training data, rather than reflecting general trends. This is known as overfitting.

# Variance and bias
# Accuracy on the training and test sets for every k from 1 to 25
train_accuracies = {}
test_accuracies = {}
neighbors = np.arange(1, 26)
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

# Dicts preserve insertion order, so the values line up with `neighbors`
my_train = list(train_accuracies.values())
my_test = list(test_accuracies.values())
plt.figure(figsize=(8, 6))
# Title spelling fixed ('Varing' -> 'Varying') to match the later standalone cell
plt.title('KNN: Varying Number of Neighbors')
plt.plot(neighbors, my_train, label='Training Accuracy')
plt.plot(neighbors, my_test, label='Testing Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

# Here's the result! As k increases beyond 15 we see underfitting, where performance plateaus on both the test and training sets, as indicated in this plot. Peak test accuracy occurs at around k = 16.

# Exercise
# NumPy arrays have been created for you containing the features as X and the target variable as y. You will split them into training and test sets, fit a KNN classifier to the training data, and then compute its accuracy on the test data using the .score() method.
# Instructions:
# • Import train_test_split from sklearn.model_selection.
# • Create Numpy arrays having all columns, except churn, as features X, and target variable, churn, as target variable y
# • Split X and y into training and test sets, setting test_size equal to 20%, random_state to 42, and ensuring the target label proportions reflect that of the original dataset.
# • Fit the knn model to the training data.
# • Compute and print the model's accuracy for the test data.
from sklearn.model_selection import train_test_split

# Every column except the target becomes a feature
X = churn_df.drop("churn", axis=1).values
y = churn_df["churn"].values

# 80/20 split. stratify=y keeps the churn proportion the same in both parts,
# which the exercise instructions require and the original call left out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Accuracy on the held-out test data
accuracy = knn.score(X_test, y_test)
print(accuracy)

# Regression with Scikit-Learn

# Predicting blood glucose levels
import pandas as pd
from sklearn.linear_model import LinearRegression

diabetes_df = pd.read_csv("diabetes.csv")

# Selecting with a one-item list yields an (n, 1) array, the 2-D shape scikit-learn expects
X_bmi = diabetes_df[["bmi"]].values
y = diabetes_df["glucose"].values

# Fitting a regression model (fit() returns the estimator) and predicting on the same inputs
reg = LinearRegression().fit(X_bmi, y)
predictions = reg.predict(X_bmi)

# Observations as points, fitted values as a line
plt.scatter(X_bmi, y)
plt.plot(X_bmi, predictions)
plt.xlabel("Body Mass Index")
plt.ylabel("Blood Glucose (mg/dl)")
plt.show()

# Let’s practice
# Creating features
# In this exercise, you will work with a dataset called sales_df (loaded from sales_df.csv), which contains information on advertising campaign expenditure across different media types, and the number of dollars generated in sales for the respective campaign.
# The dataset has been preloaded for you. Here are the first two rows:
#         tv     radio  social_media      sales
# 1  13000.0   9237.76       2409.57   46677.90
# 2  41000.0  15886.45       2913.41  150177.83
# You will use the advertising expenditure as features to predict sales values, initially working with the "radio" column.
# However, before you make any predictions you will need to create the feature and target arrays, reshaping them to the correct format for scikit-learn.
# • Create X, an array of the values from the sales_df DataFrame's "radio" column.
# • Create y, an array of the values from the sales_df DataFrame's "sales" column.
# • Reshape X into a two-dimensional NumPy array.
# • Print the shape of X and y.
# sales_df is not defined by any earlier cell, so a fresh-kernel run would stop
# here with a NameError. Load it from the file named in the exercise text above.
sales_df = pd.read_csv('sales_df.csv')

# Feature (radio spend) and target (sales) as NumPy arrays
X = sales_df["radio"].values
y = sales_df["sales"].values

# scikit-learn expects a 2-D feature array: one column, as many rows as needed
X = X.reshape(-1, 1)
print(X.shape, y.shape)

# Building a linear regression model
# Now you have created your feature and target arrays, you will train a linear regression model on all feature and target values.
# As the goal is to assess the relationship between the feature and target values there is no need to split the data into training and test sets.
# • Instantiate a linear regression model.
# • Predict sales values using X, storing as predictions.
# • Print five prediction values.
# Visualizing a linear regression model
# Now you have built your linear regression model and trained it using all available observations, you can visualize how well the model fits the data. This allows you to interpret the relationship between radio advertising expenditure and sales values.
# The variables X, an array of radio values, y, an array of sales values, and predictions, an array of the model's predicted values for y given X, have all been preloaded for you from the previous exercise.
# • Create a scatter plot visualizing y against X, with observations in blue.
# • Draw a red line plot displaying the predictions against X.
# • Display the plot.
from sklearn.linear_model import LinearRegression

# Train on all observations; the aim is to inspect the relationship, not to evaluate
reg = LinearRegression()
reg.fit(X, y)
predictions = reg.predict(X)

# The exercise asks for five prediction values to be printed; the original skipped this
print(predictions[:5])

# Observations in blue, fitted line in red
plt.scatter(X, y, color='blue')
plt.plot(X, predictions, color='red')
plt.xlabel('Radio Advertising Expenditure')
plt.ylabel('Sales')
plt.show()

# The loss function
# The vertical distance between an observation and the fitted line is called a residual. We could try to minimize the sum of the residuals, but then each positive residual would cancel out each negative residual. To avoid this, we square the residuals. By adding all the squared residuals, we calculate the residual sum of squares, or RSS. This type of linear regression is called Ordinary Least Squares, or OLS, where we aim to minimize the RSS.
# Linear regression in higher dimensions (two features): y = a1*x1 + a2*x2 + b
# • To fit a linear regression model here:
# • Need to specify 3 variables: a1, a2, b
# • In higher dimensions:
# • Known as multiple regression
# • Must specify coefficients for each feature and the variable b
# y = a1*x1 + a2*x2 + a3*x3 + ... + an*xn + b
# • scikit-learn works exactly the same way:
# • Pass two arrays: features and target

# Linear regression using all features
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# reg_all = LinearRegression()
# reg_all.fit(X_train, y_train)
# Y_pred = reg_all.predict(X_test)

# R-squared
# • R2: quantifies the variance in target values explained by the features
# • Values range from 0 to 1
# • Low R2: poor model fit
# • High R2: good model fit
# To compute R-squared, we call the model’s .score() method, passing the test features and targets. Here the features only explain about 35 percent of blood glucose level variance.
# The model has an average error for blood glucose levels of around 24 milligrams per deciliter.
# Fit and predict for regression
# Now you have seen how linear regression works, your task is to create a multiple linear regression model using all of the features in the sales_df dataset.
# As a reminder, here are the first two rows:
# You will then use this model to predict sales based on the values of the test features.
# • Create X, an array containing values of all features in sales_df, and y, containing all values from the "sales" column.
# • Instantiate a linear regression model.
# • Fit the model to the data.
# • Create y_pred, making predictions for sales using the test features.
# • Print the first two values of y_pred and y_test
# • Calculate the model's R-squared score by passing the test feature values and the test target values to an appropriate method.
# • Calculate the model's root mean squared error using y_test and y_pred.
# • Print r_squared and rmse.
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# All columns except the target are features
X = sales_df.drop("sales", axis=1).values
y = sales_df["sales"].values

# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, predict on the held-out part
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# First two predictions next to the first two actual values
print(y_pred[:2])
print(y_test[:2])

# RMSE is the square root of the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)

# .score() on a regressor returns R-squared for the data passed in
print("R-squared:", reg.score(X_test, y_test))
print("RMSE:", rmse)

# Cross-validation for R-squared
# Cross-validation is a vital approach to evaluating a model. It maximizes the amount of data that is available to the model, as the model is not only trained but also tested on all of the available data.
# In this exercise, you will build a linear regression model, then use 6-fold cross-validation to assess its accuracy for predicting sales using social media advertising expenditure. You will display the individual score for each of the six-folds.
# The sales_df dataset shall be split into y for the target variable (sales), and X for the features (radio and social media).
# Instructions
# • Import KFold and cross_val_score.
# • Create X and y according to the requirements above.
# • Create kf by calling KFold(), setting the number of splits to six, shuffle to True, and setting a seed of 5.
# • Perform cross-validation using reg on X and y, passing kf to cv.
# • Print the cv_scores.
# • Calculate and print the mean of the cv_scores results.
# • Calculate and print the standard deviation of cv_scores.
# • Display the 95% confidence interval for your results using np.quantile().
from sklearn.model_selection import cross_val_score, KFold

# The exercise asks for X = radio and social media only. Without this the X in
# scope would be the all-feature array left over from the previous exercise.
X = sales_df[["radio", "social_media"]].values
y = sales_df["sales"].values

# Six shuffled folds with a fixed seed
kf = KFold(n_splits=6, shuffle=True, random_state=5)

# One score per fold for the linear regression estimator `reg`
cv_scores = cross_val_score(reg, X, y, cv=kf)
print("Cross-validation scores:", cv_scores)
print("Mean CV R-squared:", np.mean(cv_scores))
print("CV R-squared std:", np.std(cv_scores))

# 2.5% and 97.5% quantiles of the fold scores
confidence_interval = np.quantile(cv_scores, [0.025, 0.975])
print("95% Confidence Interval:", confidence_interval)

# Regularized regression
# from sklearn.linear_model import Lasso
# diabetes_df = pd.read_csv('diabetes.csv', index_col = 0)
# diabetes_df = diabetes_df[diabetes_df['bmi'] != 0]
# diabetes_df = diabetes_df[diabetes_df['glucose'] != 0]
# X = diabetes_df.drop('glucose', axis=1).values
# y = diabetes_df['glucose'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# scores = []
# for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
#     lasso = Lasso(alpha=alpha)
#     lasso.fit(X_train, y_train)
#     lasso_pred = lasso.predict(X_test)
#     scores.append(lasso.score(X_test, y_test))
# print(scores)
from sklearn.linear_model import Lasso

diabetes_df = pd.read_csv('diabetes.csv', index_col=0)

# Drop rows where bmi or glucose is recorded as 0, using one combined mask
has_bmi = diabetes_df['bmi'] != 0
has_glucose = diabetes_df['glucose'] != 0
diabetes_df = diabetes_df[has_bmi & has_glucose]

# glucose is the target; every other column is a feature
X = diabetes_df.drop('glucose', axis=1).values
y = diabetes_df['glucose'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Test-set score for each regularisation strength
alphas = [0.01, 1.0, 10.0, 20.0, 50.0]
scores = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha).fit(X_train, y_train)
    lasso_pred = lasso.predict(X_test)
    scores.append(lasso.score(X_test, y_test))
print(scores)

# Lasso regression for feature importance
# Earlier, you saw how lasso regression can be used to identify important features in a dataset.
# In this exercise, you will fit a lasso regression model to the sales_df data and plot the model's coefficients.
# • The feature variables (all columns except sales) and target variable (sales) arrays have to be created as X and y, along with sales_columns, which contains the dataset's feature names.
# • Instantiate a Lasso regressor with an alpha of 0.1.
# • Fit the model to the data.
# • Compute the model's coefficients, storing as lasso_coef.
from sklearn.linear_model import Lasso

# Build X and y from sales_df here. The X and y left in scope were last assigned
# from diabetes_df in the regularized-regression section above, so fitting on them
# would produce coefficients that do not correspond to the sales_df column names.
sales_features = sales_df.drop("sales", axis=1)
X = sales_features.values
y = sales_df["sales"].values
names = sales_features.columns

# Lasso with alpha = 0.1; coef_ holds one coefficient per feature column
lasso = Lasso(alpha=0.1)
lasso_coef = lasso.fit(X, y).coef_

# One bar per feature
plt.bar(names, lasso_coef)
plt.xticks(rotation=45)
plt.show()

# (A second, verbatim copy of this exercise followed here and was removed.)


## the k-Nearest Neighbors (KNN) classifier to predict labels

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Read the churn table.
# Replace 'churn_df.csv' with your actual file path or URL
churn_df = pd.read_csv('churn_df.csv')

# Two feature columns and the churn target as NumPy arrays
feature_cols = ["account_length", "customer_service_calls"]
X = churn_df[feature_cols].values
y = churn_df["churn"].values

# 6-neighbour classifier, trained on all rows (fit() returns the estimator)
knn = KNeighborsClassifier(n_neighbors=6).fit(X, y)

# Three new data points to label
X_new = np.array([
    [30.0, 17.5],
    [107.0, 24.1],
    [213.0, 10.9],
])

# Predicted label for each new point
y_pred = knn.predict(X_new)

print(y_pred)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load your data into a DataFrame (replace 'your_data.csv' with your actual data file)
data = pd.read_csv('your_data.csv')

# Placeholders to edit. The original put a literal `...` inside the column list,
# which pandas cannot resolve as a column, so the cell failed at runtime.
feature_columns = ['feature1', 'feature2']  # Replace with actual feature names
target_column = 'target_column'  # Replace with actual target column name

X = data[feature_columns]
y = data[target_column]

# Split the data into training and testing sets with a 70-30 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

# Create a KNN classifier with a specified number of neighbors (e.g., 2)
knn = KNeighborsClassifier(n_neighbors=2)

# Fit the KNN classifier to the training data
knn.fit(X_train, y_train)

# Calculate and print the accuracy score on the test data
accuracy = knn.score(X_test, y_test)
print("Accuracy:", accuracy)

## numbers of neighbors, fit K-Nearest Neighbors (KNN) classifiers, and plot the training and testing accuracies as a function of the number of neighbors:


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load your data into a DataFrame (replace 'your_data.csv' with your actual data file)
data = pd.read_csv('your_data.csv')

# Placeholders to edit. The original put a literal `...` inside the column list,
# which pandas cannot resolve as a column, so the cell failed at runtime.
feature_columns = ['feature1', 'feature2']  # Replace with actual feature names
target_column = 'target_column'  # Replace with actual target column name

X = data[feature_columns]
y = data[target_column]

# Split the data into training and testing sets with a 70-30 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

# Define a range of neighbors to evaluate
neighbors = np.arange(1, 26)

# Training and testing accuracy keyed by the number of neighbors
train_accuracies = {}
test_accuracies = {}

# Fit one classifier per k and record both accuracies
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)

    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

# Dicts preserve insertion order, so the values line up with `neighbors`
my_train = list(train_accuracies.values())
my_test = list(test_accuracies.values())

# Plot the training and testing accuracies
plt.figure(figsize=(8, 6))
plt.title('KNN: Varying Number of Neighbors')
plt.plot(neighbors, my_train, label='Training Accuracy')
plt.plot(neighbors, my_test, label='Testing Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

## regression model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Read the sales data
sales_df = pd.read_csv('sales_df.csv')

# --- Step 1: features and target ------------------------------------------
# The single 'radio' column is extracted and reshaped to (n_samples, 1) in one
# statement so that X is two-dimensional when it reaches the model.
X = sales_df['radio'].values.reshape(-1, 1)
y = sales_df['sales'].values

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# --- Step 2: fit a linear regression and predict on the same data ---------
model = LinearRegression()
model.fit(X, y)
predictions = model.predict(X)

print("First five predictions:", predictions[:5])

# --- Step 3: observations (scatter) with the fitted line on top -----------
plt.scatter(X, y, color='blue', label='Observations')
plt.plot(X, predictions, color='red', label='Linear Regression')
plt.xlabel('Radio Advertising Expenditure')
plt.ylabel('Sales')
plt.legend()
plt.show()

## fitting and predicting with a multiple linear regression model:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

# Read the sales data
sales_df = pd.read_csv('sales_df.csv')

# Step 1: target is 'sales'; every other column is used as a feature
y = sales_df["sales"].values
X = sales_df.drop(columns="sales").values

# Step 2: hold out 30% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: fit the regression on the training portion only
model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: predict sales for the held-out features
y_pred = model.predict(X_test)

# Compare the first two predictions with the actual values
print("First two predictions:", y_pred[:2])
print("First two actual values:", y_test[:2])

# Step 5: R-squared on the test set
r_squared = r2_score(y_test, y_pred)

# Step 6: RMSE is the square root of the test-set mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)

print("R-squared:", r_squared)
print("Root Mean Squared Error (RMSE):", rmse)


## performing 6-fold cross-validation with a linear regression model
and displaying individual scores, mean, standard deviation, and the 95% confidence interval:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

# Read the sales data
sales_df = pd.read_csv('sales_df.csv')

# Target and the two feature columns
y = sales_df['sales'].values
X = sales_df[['radio', 'social_media']].values

# Six shuffled folds with a fixed seed so the folds are the same on every run
kf = KFold(n_splits=6, shuffle=True, random_state=5)

# One score per fold, using the estimator's default scorer
model = LinearRegression()
cv_scores = cross_val_score(model, X, y, cv=kf)
print("Individual CV Scores:", cv_scores)

# Mean of the fold scores
mean_score = cv_scores.mean()
print("Mean CV Score:", mean_score)

# Spread of the fold scores
std_deviation = cv_scores.std()
print("Standard Deviation of CV Scores:", std_deviation)

# NOTE(review): the interval reported below is the 2.5th-97.5th percentile
# range of the fold scores, not an interval derived from a standard error.
lower_bound, upper_bound = np.quantile(cv_scores, [0.025, 0.975])
print("95% Confidence Interval:", (lower_bound, upper_bound))


## Lasso regression and visualizing the coefficients.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

# Read the diabetes data, using the first CSV column as the index
diabetes_df = pd.read_csv('diabetes.csv', index_col=0)

# Keep only rows where both BMI and glucose are non-zero
valid_rows = (diabetes_df['bmi'] != 0) & (diabetes_df['glucose'] != 0)
diabetes_df = diabetes_df[valid_rows]

# Glucose is the target; all remaining columns are features
feature_frame = diabetes_df.drop('glucose', axis=1)
names = feature_frame.columns
X = feature_frame.values
y = diabetes_df['glucose'].values

# Train/test split (note: the Lasso fit below uses the full X and y, not this split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Lasso with alpha = 0.1, fitted on all rows; keep the learned coefficients
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
lasso_coef = lasso.coef_

# One bar per feature showing its coefficient
plt.bar(names, lasso_coef)
plt.xticks(rotation=45)
plt.xlabel("Features")
plt.ylabel("Coefficient Value")
plt.title("Lasso Coefficients")
plt.show()


# W10 - Machine Learning with Scikit-learn part 2.pdf

## All from chat

In [None]:
# Classification metrics
# Measuring model performance with accuracy: Fraction of correctly classified samples
# Not always a useful metric

# Class imbalance
# Classification for predicting fraudulent bank transactions
# 99% of transactions are legitimate; 1% are fraudulent
# Could build a classifier that predicts NONE of the transactions are fraudulent
# 99% accurate!
# But terrible at actually predicting fraudulent transactions
# Fails at its original purpose
# Class imbalance: Uneven frequency of classes
# Need a different way to assess performance

# The false negatives are the number of fraudulent transactions incorrectly labeled as legitimate
# The false positives are the number of legitimate transactions incorrectly labeled as fraudulent.
# The true negatives are the number of legitimate transactions correctly labeled
# The true positives are the number of fraudulent transactions correctly labeled

# Firstly, we can retrieve accuracy: it's the sum of true predictions divided by the total sum of the matrix.

# Usually, the class of interest is called the positive class. As we aim to detect fraud, the positive class is an illegitimate transaction. So why is the confusion matrix important? There are other important metrics we can calculate from the confusion matrix. Precision is the number of true positives divided by the sum of all positive predictions. It is also called the positive predictive value. In our case, this is the number of correctly labeled fraudulent transactions divided by the total number of transactions classified as fraudulent. High precision means having a lower false positive rate.

# Recall is the number of true positives divided by the sum of true positives and false negatives. This is also called sensitivity. High recall reflects a lower false negative rate. For our classifier, it means predicting most fraudulent transactions correctly.

# The F1-score is the harmonic mean of precision and recall. This metric gives equal weight to precision and recall, therefore it factors in both the number of errors made by the model and the type of errors. The F1 score favors models with similar precision and recall, and is a useful metric if we are seeking a model which performs reasonably well across both metrics.

# This report includes precision and recall by class (churn: 1 or no churn: 0), point-seven-six and point-one-six for the churn class respectively, which highlights how poor the model's recall is on the churn class. Support represents the number of instances for each class within the true labels. (Class imbalance)

# Hyperparameter tuning
# Lasso regression: choosing alpha
# KNN: choosing n_neighbors
# Hyperparameters: parameters we specify before fitting the model

# We perform k-fold cross-validation for each combination of hyperparameters. The mean scores for each combination are shown here. By default, KNN uses Euclidean distance but Manhattan distance can be selected by setting p = 1.
# For example, KNeighborsClassifier(n_neighbors=5, p=1)

# Preprocessing Data

# We will be working with a music dataset in this chapter, for both classification and regression problems. Initially, we will build a regression model using all features in the dataset to predict song popularity. There is one categorical feature, genre, with ten possible values.

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Load the music data and one-hot encode 'genre' (first level dropped),
# replacing the original 'genre' column with its indicator columns
music_df = pd.read_csv('music.csv', index_col=0)
genre_dummies = pd.get_dummies(music_df['genre'], drop_first=True)
music_dummies = pd.concat([music_df, genre_dummies], axis=1).drop('genre', axis=1)

# Popularity is the regression target; everything else is a feature
y = music_dummies['popularity'].values
X = music_dummies.drop('popularity', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5-fold CV on the training split, scored two ways:
#   linreg_cv  -> negative MSE (negated and square-rooted below to give RMSE)
#   linreg_cv2 -> the estimator's default scorer
kf = KFold(n_splits=5, shuffle=True, random_state=42)
linreg = LinearRegression()
linreg_cv = cross_val_score(
    linreg, X_train, y_train, cv=kf, scoring='neg_mean_squared_error'
)
linreg_cv2 = cross_val_score(linreg, X_train, y_train, cv=kf)

print(np.sqrt(-linreg_cv))
print(linreg_cv2)

# KNN baseline with k=5.
# NOTE(review): at this point X_train/y_train are still whatever split the
# preceding statements of this cell produced (the popularity split) — confirm
# that training a classifier on it is intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(knn.score(X_test, y_test))

# Load the uncleaned data once. The original read this file twice and never
# used the first frame before overwriting it.
music_df = pd.read_csv('music_unclean.csv', index_col=0)

# Rows missing any of these columns are dropped; remaining gaps are imputed
# inside the pipeline below
music_df = music_df.dropna(subset=['genre', 'popularity', 'loudness', 'liveness', 'tempo'])

# Binary target: 1 for 'Rock', 0 for every other genre
music_df['genre'] = np.where(music_df['genre'] == 'Rock', 1, 0)

X = music_df.drop('genre', axis=1).values
y = music_df['genre'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute, then classify. Pipeline comes from sklearn.pipeline, which this
# cell's import block must provide.
steps = [('imputation', SimpleImputer()), ('Log_reg', LogisticRegression())]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(pipeline.score(X_test, y_test))


## splits it into training and testing sets, performs cross-validation with linear regression, and prints the results

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression

# Read the music data, using the first CSV column as the index
music_df = pd.read_csv('music.csv', index_col=0)

# One-hot encode 'genre' (dropping the first level), append the indicator
# columns to the original frame, then remove the raw 'genre' column
genre_dummies = pd.get_dummies(music_df['genre'], drop_first=True)
music_dummies = pd.concat([music_df, genre_dummies], axis=1)
music_dummies = music_dummies.drop('genre', axis=1)

# Show which columns the encoded frame ended up with
print(music_dummies.columns)

# Target: popularity. Features: every other column.
y = music_dummies['popularity'].values
X = music_dummies.drop('popularity', axis=1).values

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Five shuffled folds over the training split
kf = KFold(n_splits=5, shuffle=True, random_state=42)
linreg = LinearRegression()

# Score once with negative MSE and once with the estimator's default scorer
linreg_cv = cross_val_score(
    linreg, X_train, y_train, cv=kf, scoring='neg_mean_squared_error'
)
linreg_cv2 = cross_val_score(linreg, X_train, y_train, cv=kf)

# RMSE per fold: negate the negative MSE, then take the square root
print(np.sqrt(-linreg_cv))

# Default-scorer results per fold
print(linreg_cv2)


## preprocesses the music dataset, handles missing values, encodes the 'genre' column, and uses a pipeline to perform imputation and classification with k-nearest neighbors:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

# Read the uncleaned music data
music_df = pd.read_csv("music_unclean.csv")

# Missing-value count per column, smallest first
missing_per_column = music_df.isna().sum()
print(missing_per_column.sort_values(ascending=True))

# Discard rows that lack a value in any of these columns
required_columns = ['genre', 'popularity', 'loudness', 'liveness', 'tempo']
music_df = music_df.dropna(subset=required_columns)

# Binary target: 1 where genre is 'Rock', otherwise 0
music_df['genre'] = np.where(music_df['genre'] == 'Rock', 1, 0)

# Target vector and feature matrix (all columns except 'genre')
y = music_df['genre'].values
X = music_df.drop('genre', axis=1).values

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Pipeline stages: default SimpleImputer followed by a 3-neighbor KNN
imputer = SimpleImputer()
knn = KNeighborsClassifier(n_neighbors=3)
steps = [("imputer", imputer), ("knn", knn)]
pipeline = Pipeline(steps)

# Fit on the training split, predict on the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Confusion matrix followed by the pipeline's test-set score
print(confusion_matrix(y_test, y_pred))
print(pipeline.score(X_test, y_test))

## preprocesses the music dataset, handles missing values, encodes the 'genre' column, and uses k-nearest neighbors (KNN) for classification without a pipeline:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

# Load the uncleaned music dataset
music_df = pd.read_csv("music_unclean.csv")

# Print the number of missing values in each column (ascending order)
print(music_df.isna().sum().sort_values(ascending=True))

# Drop rows with missing values in specific columns
music_df = music_df.dropna(subset=['genre', 'popularity', 'loudness', 'liveness', 'tempo'])

# Encode 'genre' column as binary (Rock vs. non-Rock)
music_df['genre'] = np.where(music_df['genre'] == 'Rock', 1, 0)

# Create feature matrix X and target vector y
feature_frame = music_df.drop('genre', axis=1)
X = feature_frame.values
y = music_df['genre'].values

# Take the feature names from the frame that produced X. A hand-written list
# has to match X's width exactly or the DataFrame constructor below raises;
# the previous list left out 'popularity', which is still part of X.
columns = list(feature_frame.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Mean imputation: statistics are learned from the training split only and
# then applied unchanged to the test split
imp = SimpleImputer(strategy='mean')
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# Labelled view of the imputed training features, for inspection
check = pd.DataFrame(X_train, columns=columns)

# Initialize KNeighborsClassifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the KNN model on the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test)

# Calculate and print the accuracy score
accuracy = knn.score(X_test, y_test)
print(accuracy)

The code above appears to produce an error; see the revised version below.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

# Read the uncleaned music data
music_df = pd.read_csv("music_unclean.csv")

# Missing-value count per column, smallest first
missing_per_column = music_df.isna().sum()
print(missing_per_column.sort_values(ascending=True))

# Discard rows that lack a value in any of these columns
music_df = music_df.dropna(
    subset=['genre', 'popularity', 'loudness', 'liveness', 'tempo']
)

# Binary target: 1 where genre is 'Rock', otherwise 0
music_df['genre'] = np.where(music_df['genre'] == 'Rock', 1, 0)

# Target vector and feature matrix (all columns except 'genre')
y = music_df['genre'].values
X = music_df.drop('genre', axis=1).values

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Mean imputation fitted on the training split, then applied to both splits
imp = SimpleImputer(strategy='mean')
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# Feature column names (excluding 'genre'); not referenced again in this cell
columns = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# 5-neighbor KNN trained on the imputed training features
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predictions for the held-out features
y_pred = knn.predict(X_test)

# Test-set accuracy
accuracy = knn.score(X_test, y_test)
print(accuracy)

FileNotFoundError: ignored

# W11 - Credit card approval predictor.PDF

## Supervised Machine learning by LogisticRegression

In [None]:
# Imports for this cell, gathered in one place
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the credit card approvals dataset (the file has no header row)
cc_apps = pd.read_csv("cc_approvals.data", header=None)
cc_apps.head()

# Display a summary of the dataset
print(cc_apps.describe())
print('\n')
print(cc_apps.info())
print('\n')

# Display the last 17 rows of the dataset
cc_apps.tail(17)  # or cc_apps.sample()

# Correlation matrix of the numeric columns only: calling .corr() on a frame
# that still holds string columns raises in pandas >= 2.0
print(cc_apps.select_dtypes(include='number').corr())

# Drop the features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)

# The label is taken to be the last remaining column, which is the column the
# original cell tried to slice off with iloc[:, -1]
label_col = cc_apps.columns[-1]

# Split the dataset into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

# Replace the '?'s with NaN in the train and test sets
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2.0)
cc_apps_train = cc_apps_train.replace('?', np.nan)
cc_apps_test = cc_apps_test.replace('?', np.nan)

# Mean-impute the numeric columns, using training-set means for both splits.
# numeric_only=True keeps .mean() from raising on the string columns.
train_means = cc_apps_train.mean(numeric_only=True)
cc_apps_train = cc_apps_train.fillna(train_means)
cc_apps_test = cc_apps_test.fillna(train_means)

# Count the number of NaNs in the datasets and print the counts to verify
print(cc_apps_train.isnull().sum())
print(cc_apps_test.isnull().sum())

# Impute each non-numeric column with ITS OWN most frequent training value.
# The original called fillna on the whole frame inside the loop, so the mode
# of one column was written into the gaps of every column.
for col in cc_apps_train.columns:
    if pd.api.types.is_numeric_dtype(cc_apps_train[col]):
        continue
    # value_counts() is sorted by descending frequency, so index[0] is the mode
    counts = cc_apps_train[col].value_counts()
    if counts.empty:
        # Column has no observed values at all; nothing to impute with
        continue
    most_frequent = counts.index[0]
    cc_apps_train[col] = cc_apps_train[col].fillna(most_frequent)
    cc_apps_test[col] = cc_apps_test[col].fillna(most_frequent)

# Count the number of NaNs in the dataset and print the counts to verify
print(cc_apps_train.isnull().sum())
print(cc_apps_test.isnull().sum())

# Separate the label BEFORE one-hot encoding. get_dummies appends its indicator
# columns after the untouched numeric ones, so slicing the encoded frame with
# iloc[:, -1] does not reliably isolate the label: for a string label it takes
# one indicator and leaves the complementary indicator among the features.
y_train = cc_apps_train[label_col].values
y_test = cc_apps_test[label_col].values

# Convert the categorical features in the train and test sets independently
print(cc_apps_train)
cc_apps_train = pd.get_dummies(cc_apps_train.drop(columns=label_col))
cc_apps_test = pd.get_dummies(cc_apps_test.drop(columns=label_col))
print(cc_apps_train)

# Reindex the columns of the test set aligning with the train set; categories
# seen only in training become all-zero columns in the test set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

# Feature matrices (the label was already removed above)
X_train = cc_apps_train.values
X_test = cc_apps_test.values

# Rescale features to [0, 1]; the scaler is fitted on the training split only
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of the logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test, y_pred)

## KNN

In [None]:
# Import pandas and other libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

# Read the credit card approvals dataset (the file has no header row)
cc_apps = pd.read_csv("cc_approvals.data", header=None)

# Display a summary of the dataset
print(cc_apps.describe())
print('\n')
print(cc_apps.info())
print('\n')

# Drop the features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)

# The label is taken to be the last remaining column, which is the column the
# original cell tried to slice off with iloc[:, -1]
label_col = cc_apps.columns[-1]

# Split the dataset into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

# Replace the '?'s with NaN in the train and test sets
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2.0)
cc_apps_train = cc_apps_train.replace('?', np.nan)
cc_apps_test = cc_apps_test.replace('?', np.nan)

# Mean-impute the numeric columns, using training-set means for both splits.
# numeric_only=True keeps .mean() from raising on the string columns.
train_means = cc_apps_train.mean(numeric_only=True)
cc_apps_train = cc_apps_train.fillna(train_means)
cc_apps_test = cc_apps_test.fillna(train_means)

# Separate the label BEFORE one-hot encoding. get_dummies appends its indicator
# columns after the untouched numeric ones, so slicing the encoded frame with
# iloc[:, -1] does not reliably isolate the label: for a string label it takes
# one indicator and leaves the complementary indicator among the features.
y_train = cc_apps_train[label_col].values
y_test = cc_apps_test[label_col].values

# Convert the categorical features in the train and test sets independently
cc_apps_train = pd.get_dummies(cc_apps_train.drop(columns=label_col))
cc_apps_test = pd.get_dummies(cc_apps_test.drop(columns=label_col))

# Reindex the columns of the test set aligning with the train set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

# Feature matrices (the label was already removed above)
X_train = cc_apps_train.values
X_test = cc_apps_test.values

# Rescale features to [0, 1]; the scaler is fitted on the training split only
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Find the best 'k' for KNN using 5-fold cross-validation on the training set.
# Ties keep the smallest k because only a strictly better mean replaces it.
best_k = None
best_accuracy = 0

for k in range(1, 21):  # You can adjust the range of k as needed
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, rescaledX_train, y_train, cv=5)
    mean_accuracy = scores.mean()

    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_k = k

# Train KNN with the best 'k' on the entire training set
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(rescaledX_train, y_train)

# Use the trained KNN to make predictions on the test set
y_pred = best_knn.predict(rescaledX_test)

# Get the accuracy score of the KNN model and print it
accuracy = best_knn.score(rescaledX_test, y_test)
print(f"Accuracy of KNN classifier with k={best_k}: {accuracy}")

# Print the confusion matrix of the KNN model
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

In the previous cells, the data is split into train and test sets before imputing and preprocessing.
Your task here is to
## Reverse these steps by performing cleaning, imputing and preprocessing before splitting the data into train and test sets.
 Use a Logistic Regression model and evaluate its performance.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the credit card approvals dataset (the file has no header row)
cc_apps = pd.read_csv("cc_approvals.data", header=None)

# Display a summary of the dataset
print(cc_apps.describe())
print('\n')
print(cc_apps.info())
print('\n')

# Drop the features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)

# Replace the '?'s with NaN
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2.0)
cc_apps = cc_apps.replace('?', np.nan)

# Mean-impute the numeric columns; numeric_only=True keeps .mean() from
# raising on the string columns
cc_apps = cc_apps.fillna(cc_apps.mean(numeric_only=True))

# Separate the label BEFORE one-hot encoding. The label is taken to be the last
# remaining column. get_dummies appends its indicator columns after the
# untouched numeric ones, so slicing the encoded frame with iloc[:, -1] does
# not reliably isolate the label: for a string label it takes one indicator and
# leaves the complementary indicator among the features.
label_col = cc_apps.columns[-1]
y = cc_apps[label_col].values

# Convert the categorical features. (The original followed this with a reindex
# against the frame's own columns, which changed nothing and was removed.)
cc_apps = pd.get_dummies(cc_apps.drop(columns=label_col))

# Feature matrix (the label was already removed above)
X = cc_apps.values

# Rescale features to [0, 1]. As this exercise requires, scaling happens before
# the split, so the scaler sees the rows that later form the test set.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(rescaledX, y, test_size=0.33, random_state=42)

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(X_train, y_train)

# Use logreg to predict instances from the test set
y_pred = logreg.predict(X_test)

# Get the accuracy score of the logreg model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of logistic regression classifier: {accuracy}")

# Print the confusion matrix of the logreg model
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)