In [None]:
#Logistic Regression 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Feature engineering: build a binary label that is 1 when a row's Profit is
# above the mean Profit of the loaded data and 0 otherwise.
average_profit = df['Profit'].mean()
df['Target'] = df['Profit'].gt(average_profit).astype(int)

# Split the frame into predictors (X) and label (y). Profit is removed from
# the predictors too, because the label is derived directly from it.
y = df['Target']
X = df.drop(columns=['Profit', 'Target'])

# Group the predictor columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Per-group preprocessing.
# Categorical: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Numerical: fill gaps with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Chain preprocessing and the classifier so both are fitted on the training
# rows only. Raise max_iter further if a convergence warning appears.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
#Decision Trees
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Replace 'data.csv' with your actual CSV file path
df = pd.read_csv('data.csv')

# Data Preprocessing
# Separating the feature variables (X) and the target variable (y)
# Assuming the target variable is named 'Target'
X = df.drop('Target', axis=1)
y = df['Target']

# The column groups are taken from X, not df. Selecting them from df would
# also pick up 'Target' (whenever it is int64, float64 or object), and
# ColumnTransformer raises a ValueError when it is asked for a column that
# the frame it is fitted on (X) does not contain.
# Identifying categorical columns with dtype 'object'
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
# Identifying numerical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Handling missing values
# Numerical columns: fill gaps with the column mean
numerical_transformer = SimpleImputer(strategy='mean')
# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Train/Test Split (20% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model Training
# Create a pipeline that first preprocesses the data then applies the classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=0))
])

model.fit(X_train, y_train)

# Evaluation on the held-out rows
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')


In [None]:
#Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Split the frame into predictors (X) and the label column (y).
# The label column is expected to be called 'Target'.
y = df['Target']
X = df.drop(columns=['Target'])

# Group the predictor columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Per-group preprocessing.
# Categorical: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Numerical: fill gaps with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Chain preprocessing and the forest so both are fitted on training rows only.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0)),
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')


In [None]:
#Support Vector Machines
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Split the frame into predictors (X) and the label column (y).
# The label column is expected to be called 'Target'.
y = df['Target']
X = df.drop(columns=['Target'])

# Group the predictor columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the column mean, then standardise,
# since SVMs are sensitive to feature scale.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Chain preprocessing and the SVM. A linear kernel is used here; other
# kernels such as 'rbf' can be substituted.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear')),
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')


In [None]:
#K-Nearest Neighbor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Split the frame into predictors (X) and the label column (y).
# The label column is expected to be called 'Target'.
y = df['Target']
X = df.drop(columns=['Target'])

# Group the predictor columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the column mean, then standardise.
# KNN is distance-based, so unscaled features would dominate the metric.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Chain preprocessing and the classifier; n_neighbors is a tunable setting.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')


In [None]:
#Linear Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Split the frame into predictors (X) and the response column (y).
# The response column is expected to be called 'Target'; every other
# column is used as a feature.
y = df['Target']
X = df.drop(columns=['Target'])

# Group the predictor columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the column mean, then standardise
# (the scaling step is optional for plain linear regression).
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Chain preprocessing and the regressor so both are fitted on training rows only.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print(f'Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred)}')
print(f'Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}')
print(f'R-squared (R^2): {r2_score(y_test, y_pred)}')


In [None]:
#Polynomial Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Replace 'data.csv' with your actual CSV file path
df = pd.read_csv('data.csv')

# Feature Engineering
# Columns that should receive polynomial and interaction terms.
# 'Feature1' and 'Feature2' are placeholders: replace them with the names of
# numerical columns that exist in your data.
selected_features = ['Feature1', 'Feature2']

# Generating polynomial and interaction features
# Degree of polynomial can be adjusted
poly = PolynomialFeatures(degree=2, include_bias=False)

# Data Preprocessing
# Assuming 'Target' is the name of the target variable
X = df.drop('Target', axis=1)
y = df['Target']

# Fail early, with a readable message, if the placeholder names above were
# not replaced by real column names.
missing_features = [col for col in selected_features if col not in X.columns]
if missing_features:
    raise ValueError(
        f"selected_features contains columns that are not in the data: {missing_features}"
    )

# Identifying categorical columns with dtype 'object'
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
# Identifying numerical columns excluding those selected for polynomial features
numerical_features = [col for col in X.select_dtypes(include=['int64', 'float64']).columns.tolist() if col not in selected_features]

# Handling missing values and scaling numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Selected columns: impute, expand into polynomial/interaction terms, then
# scale the expanded terms. Previously these columns were excluded from
# numerical_features but never routed anywhere, so ColumnTransformer
# (remainder='drop' by default) discarded them, and the polynomial expansion
# was applied to every *other* feature instead.
polynomial_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', poly),
    ('scaler', StandardScaler())
])

# Converting categorical variables using one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing: polynomial expansion only for the selected columns,
# plain scaling for the remaining numerical columns, one-hot for categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', polynomial_transformer, selected_features),
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Train/Test Split (20% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model Training
# The polynomial features are generated inside the preprocessor, so the
# pipeline is just preprocessing followed by the regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)

# Evaluation on the held-out rows
y_pred = model.predict(X_test)
print(f'Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred)}')
print(f'Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}')
print(f'R-squared (R^2): {r2_score(y_test, y_pred)}')


In [None]:
#Ridge and Lasso Regression 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Split the frame into predictors (X) and the response column (y).
# The response column is expected to be called 'Target'; every other
# column is used as a feature.
y = df['Target']
X = df.drop(columns=['Target'])

# Group the predictor columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the column mean, then standardise.
# Scaling matters here because the penalty acts on coefficient size.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Two pipelines that differ only in the regressor. Both reference the same
# preprocessor object and are fitted on the same training rows.
# The alpha values are tunable.
ridge_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
])
lasso_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.1)),
])
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# Held-out metrics for the ridge model.
ridge_pred = ridge_model.predict(X_test)
print("Ridge Regression")
print(f'Mean Squared Error (MSE): {mean_squared_error(y_test, ridge_pred)}')
print(f'R-squared (R^2): {r2_score(y_test, ridge_pred)}')

# Held-out metrics for the lasso model.
lasso_pred = lasso_model.predict(X_test)
print("\nLasso Regression")
print(f'Mean Squared Error (MSE): {mean_squared_error(y_test, lasso_pred)}')
print(f'R-squared (R^2): {r2_score(y_test, lasso_pred)}')

# Pull the fitted coefficient vectors out of each pipeline for comparison.
ridge_regressor = ridge_model.named_steps['regressor']
lasso_regressor = lasso_model.named_steps['regressor']
ridge_coefs = ridge_regressor.coef_
lasso_coefs = lasso_regressor.coef_

print("\nRidge coefficients:", ridge_coefs)
print("Lasso coefficients:", lasso_coefs)

# Short reminder of how the two penalties differ.
print("\nEffect of Regularization:")
print("Ridge regression minimizes coefficient size squared (L2 penalty).")
print("Lasso regression minimizes coefficient size (L1 penalty) and can set some coefficients to zero, thus performing feature selection.")


In [None]:
#K-Means Clustering
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Replace 'data.csv' with your actual CSV file path
df = pd.read_csv('data.csv')

# Data Preprocessing
# Mean imputation and standard scaling only work on numbers, so cluster on
# the numeric columns only; a text column would make the imputer raise.
# Columns with no values at all are removed up front because the mean
# imputer would otherwise drop them and the column labels would no longer
# line up with the imputed array.
numeric_df = df.select_dtypes(include='number').dropna(axis=1, how='all')

# Handling missing values
imputer = SimpleImputer(strategy='mean')
df_filled = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,  # keep df's index so the masks below align with df
)

# Normalizing numerical features
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_filled),
    columns=df_filled.columns,
    index=df_filled.index,
)

# Clustering with KMeans
# Trying with a range of clusters to find the optimal one
inertia = []
cluster_options = [2, 3, 4, 5, 6]  # Replace with the range of clusters you want to try
for n_clusters in cluster_options:
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise change results (and emit a warning
    # on some versions).
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(df_scaled)
    inertia.append(kmeans.inertia_)

# Plot the inertia to see which number of clusters is best (elbow method)
plt.plot(cluster_options, inertia, '-o')
plt.xlabel('Number of clusters, k')
plt.ylabel('Inertia')
plt.title('Inertia of k-Means versus number of clusters')
plt.show()

# Let's assume the optimal number of clusters is 3 for this example
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=0)
df_scaled['Cluster'] = kmeans.fit_predict(df_scaled)

# Visualization & Interpretation
# Reduce the dimension to 2D using PCA and visualize the clusters.
# The label column is excluded so PCA only sees the scaled features.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled.drop('Cluster', axis=1))

plt.figure(figsize=(8, 6))
plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df_scaled['Cluster'], cmap='viridis', marker='o', edgecolor='k', s=50)
plt.title('2D Visualization of Clusters')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.colorbar(label='Cluster')
plt.show()

# Analyzing and interpreting the characteristics of each cluster
# Describe each cluster using the original (unscaled) values
for i in range(optimal_clusters):
    cluster_data = df[df_scaled['Cluster'] == i]
    print(f"\nCluster {i}:")
    print(cluster_data.describe().transpose())



In [None]:
#Hierarchical Clustering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Replace 'data.csv' with your actual CSV file path
df = pd.read_csv('data.csv')

# Data Preprocessing
# Identifying categorical columns with dtype 'object'
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
# Identifying numerical columns
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Handling missing values and scaling numerical features.
# Missing values are imputed first; without this step they would be passed
# on to the clustering step, which needs finite inputs.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Converting categorical variables using one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 makes ColumnTransformer always return a dense array.
# With the default threshold it may return a SciPy sparse matrix when the
# one-hot block is sparse enough, which the dense steps below cannot use.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

df_processed = preprocessor.fit_transform(df)
# Keep df's index so rows of the processed frame map back to the original rows
df_processed = pd.DataFrame(df_processed, index=df.index)

# Clustering
# Use the linkage method to perform the hierarchical clustering
# (Ward linkage on the processed feature matrix)
linked = linkage(df_processed.to_numpy(), method='ward')

# Visualization: Displaying the dendrogram, one leaf per row of df
plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           labels=np.array(df.index),
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()

# Interpreting the dendrogram
print("Interpret the dendrogram by observing the height at which any two clusters merge. Larger heights indicate that clusters are more distinct.")


In [None]:
#DBSCAN
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Load the dataset; swap 'data.csv' for the path to your own CSV file.
df = pd.read_csv('data.csv')

# Every column is assumed to be numerical. Categorical columns, if any,
# should be encoded or left out before clustering.
# Standardise the features.
scaler = StandardScaler()
df_normalized = scaler.fit_transform(df)

# Run DBSCAN; eps and min_samples are worth experimenting with.
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(df_normalized)

# Visualization & Interpretation
# Scatter the first two scaled features, one colour per cluster label.
plt.figure(figsize=(10, 6))
unique_labels = set(clusters)
colors = [plt.cm.Spectral(fraction) for fraction in np.linspace(0, 1, len(unique_labels))]
for label, color in zip(unique_labels, colors):
    # Label -1 marks noise points; they are drawn in black with a larger marker.
    is_noise = label == -1
    if is_noise:
        color = [0, 0, 0, 1]

    members = df_normalized[clusters == label]
    plt.plot(
        members[:, 0],
        members[:, 1],
        'o',
        markerfacecolor=tuple(color),
        markeredgecolor='k',
        markersize=14 if is_noise else 6,
    )

plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1 (scaled)')
plt.ylabel('Feature 2 (scaled)')
plt.show()

# Summary of how to read the plot.
print("Clusters identified by DBSCAN are highlighted, and noise points are marked in black.")
print("Each cluster represents a dense region of data points surrounded by a region of lower density.")
print("Points not assigned to any cluster, and thus considered noise, are those in sparse areas.")
