<a href="https://colab.research.google.com/github/sanadv/MLCourse/blob/main/Classification_Solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables.
# XGBClassifier is given a purely numeric frame here: every 'object'-dtype
# column (assumed to be categorical) is integer-encoded with LabelEncoder,
# which maps each distinct value in the column to a distinct integer.
# Note: the single encoder is refit for every column, so afterwards it only
# remembers the classes of the last column it processed.
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target.
# axis=1 makes drop() remove a column; axis=0 would remove a row label instead.
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training (70%), validation (15%), and test (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Initialize and fit the model on the training set.
# The former `use_label_encoder=False` argument is intentionally omitted: the
# flag is deprecated, and recent XGBoost releases report it as an unused parameter.
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the training set and calculate loss
y_pred_train = model.predict(X_train)
y_pred_train_proba = model.predict_proba(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_loss = log_loss(y_train, y_pred_train_proba)

# Make predictions on the validation set and calculate loss
y_pred_val = model.predict(X_val)
y_pred_val_proba = model.predict_proba(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_val_proba)

# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val))


ModuleNotFoundError: No module named 'xgboost'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Read the raw dataset
df = pd.read_csv('ds3.csv')

# Integer-encode each object-dtype column in turn. The one encoder is refit
# per column, so it ends up holding only the last column's classes.
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col_name in categorical_cols:
    df[col_name] = le.fit_transform(df[col_name])

# 'Response' is the target; every other column is a feature
y = df['Response']
X = df.drop(columns='Response')

# 70% train; the held-out 30% is halved into validation and test
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

# Fit Gaussian naive Bayes on the training rows (fit() returns the estimator)
model = GaussianNB().fit(X_train, y_train)

# Hard labels and class probabilities for the training and validation rows
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_train_proba = model.predict_proba(X_train)
y_pred_val_proba = model.predict_proba(X_val)

# Accuracy from the labels, log loss from the probabilities
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
train_loss = log_loss(y_train, y_pred_train_proba)
val_loss = log_loss(y_val, y_pred_val_proba)

# Report the four summary numbers
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Per-class precision, recall and F1 on the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial
from sklearn.metrics import accuracy_score, classification_report, log_loss
import numpy as np

# add_constant is needed below because statsmodels does not add an intercept itself
from statsmodels.tools.tools import add_constant

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables
# (each object-dtype column is integer-encoded; the encoder is refit per column)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training, validation, and test sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Add an explicit intercept column. Without it the binomial GLM is forced
# through the origin on the logit scale. has_constant='add' always appends the
# column, so the train and validation design matrices keep the same layout even
# if some feature happens to be constant within one of the splits.
X_train_const = add_constant(X_train, has_constant='add')
X_val_const = add_constant(X_val, has_constant='add')

# Initialize and fit the model on the training set
model = GLM(y_train, X_train_const, family=Binomial())
result = model.fit()

# Make predictions on the training set and calculate loss.
# result.predict() yields P(Response == 1); log_loss expects one column per
# class, so stack [P(0), P(1)]. Rounding the probability gives the hard label.
y_pred_train = result.predict(X_train_const)
y_pred_train_proba = np.column_stack((1-y_pred_train, y_pred_train))
train_accuracy = accuracy_score(y_train, y_pred_train.round())
train_loss = log_loss(y_train, y_pred_train_proba)

# Make predictions on the validation set and calculate loss
y_pred_val = result.predict(X_val_const)
y_pred_val_proba = np.column_stack((1-y_pred_val, y_pred_val))
val_accuracy = accuracy_score(y_val, y_pred_val.round())
val_loss = log_loss(y_val, y_pred_val_proba)

# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val.round()))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables
# (each object-dtype column is integer-encoded; the encoder is refit per column)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training (70%), validation (15%), and test (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Initialize and fit the model on the training set
# (max_iter raised above the default to give the solver more iterations)
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)

# Make predictions on the training set and calculate loss
y_pred_train = model1.predict(X_train)
y_pred_train_proba = model1.predict_proba(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_loss = log_loss(y_train, y_pred_train_proba)

# Make predictions on the validation set and calculate loss
y_pred_val = model1.predict(X_val)
y_pred_val_proba = model1.predict_proba(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_val_proba)

# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Print precision, recall, and F1 score for the validation set.
# predict() already returns class labels, so they are passed through as-is
# (no rounding, which is only meaningful for probability outputs).
print(classification_report(y_val, y_pred_val))



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables
# (each object-dtype column is integer-encoded; the encoder is refit per column)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training (70%), validation (15%), and test (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Initialize and fit the model on the training set.
# random_state is fixed (same seed as the splits) so that re-running the cell
# reproduces the same tree and therefore the same metrics.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the training set and calculate loss
y_pred_train = model.predict(X_train)
y_pred_train_proba = model.predict_proba(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_loss = log_loss(y_train, y_pred_train_proba)

# Make predictions on the validation set and calculate loss
y_pred_val = model.predict(X_val)
y_pred_val_proba = model.predict_proba(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_val_proba)

# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val))

# Make predictions on the test set
y_pred_test = model.predict(X_test)

# Display first 10 actual vs. predicted responses from the test set.
# y_test keeps the shuffled original row labels as its index, so .iloc makes
# it explicit that the first ten rows are taken by position.
first_10_actual = y_test.iloc[:10]
first_10_predicted = y_pred_test[:10]

print("First 10 Actual Responses: ", first_10_actual.values)
print("First 10 Predicted Responses: ", first_10_predicted)

# Optionally, you could display them side by side for easier comparison
for actual, predicted in zip(first_10_actual, first_10_predicted):
    print(f"Actual: {actual}, Predicted: {predicted}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables
# (each object-dtype column is integer-encoded; the encoder is refit per column)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training (70%), validation (15%), and test (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Initialize and fit the model on the training set.
# A random forest is stochastic; fixing random_state (same seed as the splits)
# makes the reported metrics reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the training set and calculate loss
y_pred_train = model.predict(X_train)
y_pred_train_proba = model.predict_proba(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_loss = log_loss(y_train, y_pred_train_proba)

# Make predictions on the validation set and calculate loss
y_pred_val = model.predict(X_val)
y_pred_val_proba = model.predict_proba(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_val_proba)

# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables
# (each object-dtype column is integer-encoded; the encoder is refit per column)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training (70%), validation (15%), and test (15%) sets.
# The split happens BEFORE scaling so that the scaler never sees held-out rows.
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Normalize the features: learn mean/std from the training rows only, then
# apply that same transform to validation and test. Fitting the scaler on the
# full dataset would leak validation/test statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define the model: two ReLU hidden layers and a single sigmoid output unit
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))  # First hidden layer (also declares the input width)
model.add(Dense(16, activation='relu'))  # Hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model (evaluate returns [loss, accuracy]; only accuracy is kept)
_, train_acc = model.evaluate(X_train, y_train, verbose=0)
_, val_acc = model.evaluate(X_val, y_val, verbose=0)
print("Training Accuracy: %.2f%%" % (train_acc * 100.0))
print("Validation Accuracy: %.2f%%" % (val_acc * 100.0))

# Predict probabilities for the validation set
y_pred_val_probs = model.predict(X_val)

# Convert probabilities to class labels (threshold 0.5) and flatten to 1-D
y_pred_val = (y_pred_val_probs > 0.5).astype(int).reshape(-1)

# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Read the raw dataset
df = pd.read_csv('ds3.csv')

# Integer-encode each object-dtype column in turn. The one encoder is refit
# per column, so it ends up holding only the last column's classes.
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col_name in categorical_cols:
    df[col_name] = le.fit_transform(df[col_name])

# 'Response' is the target; every other column is a feature
y = df['Response']
X = df.drop(columns='Response')

# 70% train; the held-out 30% is halved into validation and test
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

# Fit k-nearest-neighbours with default settings (fit() returns the estimator)
model = KNeighborsClassifier().fit(X_train, y_train)

# Hard labels and class probabilities for the training and validation rows
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_train_proba = model.predict_proba(X_train)
y_pred_val_proba = model.predict_proba(X_val)

# Accuracy from the labels, log loss from the probabilities
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
train_loss = log_loss(y_train, y_pred_train_proba)
val_loss = log_loss(y_val, y_pred_val_proba)

# Report the four summary numbers
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Per-class precision, recall and F1 on the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Read the raw dataset
df = pd.read_csv('ds3.csv')

# Integer-encode each object-dtype column in turn. The one encoder is refit
# per column, so it ends up holding only the last column's classes.
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col_name in categorical_cols:
    df[col_name] = le.fit_transform(df[col_name])

# 'Response' is the target; every other column is a feature
y = df['Response']
X = df.drop(columns='Response')

# 70% train; the held-out 30% is halved into validation and test
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

# Fit linear discriminant analysis on the training rows (fit() returns the estimator)
model = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Hard labels and class probabilities for the training and validation rows
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_train_proba = model.predict_proba(X_train)
y_pred_val_proba = model.predict_proba(X_val)

# Accuracy from the labels, log loss from the probabilities
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
train_loss = log_loss(y_train, y_pred_train_proba)
val_loss = log_loss(y_val, y_pred_val_proba)

# Report the four summary numbers
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Per-class precision, recall and F1 on the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Read the raw dataset
df = pd.read_csv('ds3.csv')

# Integer-encode each object-dtype column in turn. The one encoder is refit
# per column, so it ends up holding only the last column's classes.
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col_name in categorical_cols:
    df[col_name] = le.fit_transform(df[col_name])

# 'Response' is the target; every other column is a feature
y = df['Response']
X = df.drop(columns='Response')

# 70% train; the held-out 30% is halved into validation and test
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

# Fit quadratic discriminant analysis on the training rows (fit() returns the estimator)
model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Hard labels and class probabilities for the training and validation rows
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_train_proba = model.predict_proba(X_train)
y_pred_val_proba = model.predict_proba(X_val)

# Accuracy from the labels, log loss from the probabilities
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
train_loss = log_loss(y_train, y_pred_train_proba)
val_loss = log_loss(y_val, y_pred_val_proba)

# Report the four summary numbers
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Per-class precision, recall and F1 on the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.gaussian_process import GaussianProcessClassifier  # changed from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Read the raw dataset
df = pd.read_csv('ds3.csv')

# Integer-encode each object-dtype column in turn. The one encoder is refit
# per column, so it ends up holding only the last column's classes.
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col_name in categorical_cols:
    df[col_name] = le.fit_transform(df[col_name])

# 'Response' is the target; every other column is a feature
y = df['Response']
X = df.drop(columns='Response')

# 70% train; the held-out 30% is halved into validation and test
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

# Fit a Gaussian process classifier with default settings
# (fit() returns the estimator)
model = GaussianProcessClassifier().fit(X_train, y_train)

# Hard labels and class probabilities for the training and validation rows
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_train_proba = model.predict_proba(X_train)
y_pred_val_proba = model.predict_proba(X_val)

# Accuracy from the labels, log loss from the probabilities
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
train_loss = log_loss(y_train, y_pred_train_proba)
val_loss = log_loss(y_val, y_pred_val_proba)

# Report the four summary numbers
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Per-class precision, recall and F1 on the validation set
print(classification_report(y_val, y_pred_val))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.gaussian_process import GaussianProcessClassifier  # changed from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.ensemble import AdaBoostClassifier

# Load the data
df = pd.read_csv('ds3.csv')

# Convert categorical variables to numeric variables
# (each object-dtype column is integer-encoded; the encoder is refit per column)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Separate features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Split data into training (70%), validation (15%), and test (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Initialize and fit the model on the training set.
# random_state is fixed (same seed as the splits) so that re-running the cell
# reproduces the same ensemble and therefore the same metrics.
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the training set and calculate loss
y_pred_train = model.predict(X_train)
y_pred_train_proba = model.predict_proba(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_loss = log_loss(y_train, y_pred_train_proba)

# Make predictions on the validation set and calculate loss
y_pred_val = model.predict(X_val)
y_pred_val_proba = model.predict_proba(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_val_proba)

# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val))
