<a href="https://colab.research.google.com/github/nikhil-140/ML_practice/blob/main/ML_Concepts_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Clean and Explore Employee Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# Step 1: Load Dataset
# -----------------------------
df = pd.read_csv("employee_practice.csv")
print("First 5 rows:\n", df.head())

# -----------------------------
# Step 2: Identify Features & Label and Print features columns and label name
# -----------------------------
# Every column except the target is treated as a feature. These are snapshots
# of the raw data; the cleaning below rebinds/updates `df`, not these objects.
features = df.drop("Promotion_Eligibility", axis=1)
label = df["Promotion_Eligibility"]

print("\nFeatures:\n", features)
print("Label:\n", label)

# -----------------------------
# Step 3: Handle Missing Values
# -----------------------------
print("\nMissing Values Before:\n", df.isnull().sum())

# Fill numerical missing values with the column median
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Years_Experience"] = df["Years_Experience"].fillna(df["Years_Experience"].median())

# Fill categorical missing values with the most frequent value.
# `mode()` returns a Series (there can be ties), so take its first entry to
# get a scalar; passing the Series itself to `fillna` would align on the
# index and leave most gaps unfilled.
df["Department"] = df["Department"].fillna(df["Department"].mode().iloc[0])
df["Promotion_Eligibility"] = df["Promotion_Eligibility"].fillna(
    df["Promotion_Eligibility"].mode().iloc[0]
)

print("\nMissing Values After:\n", df.isnull().sum())

# -----------------------------
# Step 4: Detect & Handle Outliers (Simple Method)
# -----------------------------
# A salary is an outlier when it lies strictly outside [SALARY_LOW, SALARY_HIGH].
SALARY_LOW = 30000
SALARY_HIGH = 150000

salary_outliers = df[(df["Salary"] < SALARY_LOW) | (df["Salary"] > SALARY_HIGH)]
print("\nSalary Outliers:\n", salary_outliers)

# Remove Salary outliers. `between` is inclusive on both ends, so rows sitting
# exactly on a threshold are kept (they were not reported as outliers above).
# Rows with a missing Salary are dropped, as before.
df = df[df["Salary"].between(SALARY_LOW, SALARY_HIGH)]

# Handle Years_Experience outliers (remove > 30 years)
df = df[df["Years_Experience"] <= 30]


# -----------------------------
# Step 5: Explore & Visualize the Cleaned Data
# -----------------------------
print("\nDescriptive Statistics:\n", df.describe())
# value_counts() gives one count per department; count() would only give the
# total number of non-null rows.
print("\nEmployees per Department:\n", df["Department"].value_counts())
print("\nRest Employees: ", df)

plt.figure(figsize=(6,4))
plt.boxplot(df["Salary"])
plt.title("Salary Distribution After Handling Outliers")
plt.ylabel("Salary")  # axis label must be text, not the data column
plt.savefig("salary_boxplot.png")  # Save the image
plt.show()



Data Preparation and Splitting

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Read the raw employee records
df = pd.read_csv("employees.csv")
print("First 5 rows:\n", df.head())

# Step 2: The target column is the label; everything else is a feature
target_col = "Promotion_Eligibility"
y = df[target_col]
X = df.drop(columns=[target_col])

print("\nFeatures:\n", X)
print("\nLabel:\n", y)

# Step 3: Encoding Categorical Data
# Label encoding for Gender (values outside the mapping become NaN)
gender_codes = {"Male": 0, "Female": 1}
df["Gender_Label"] = df["Gender"].map(gender_codes)
print("\nAfter Label Encoding Gender:\n", df[["Name","Gender","Gender_Label"]].head())

# One-hot encoding for Department: one indicator column per department value
df_onehot = pd.get_dummies(df, columns=["Department"])
print("\nAfter One-Hot Encoding Department:\n", df_onehot.head())

# Step 4: Feature Scaling (both scalers see the same numeric columns)
numeric_cols = ["Age", "Salary", "Years_Experience"]
numeric_frame = df[numeric_cols]

# Normalization: rescale each column to the [0, 1] range
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(numeric_frame)
print("\nAfter Normalization:\n", X_normalized[:5])

# Standardization: zero mean, unit variance per column
scaler = StandardScaler()
X_standardized = scaler.fit_transform(numeric_frame)
print("\nAfter Standardization:\n", X_standardized[:5])

# Step 5: Train-Test Split (80/20, fixed seed for repeatable output)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("\nTrain shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Linear Regression - Practice Problem

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Step 1: Read the study-hours dataset
df = pd.read_csv("employee_study.csv")

# Step 2: Pick the single feature and the label (both kept two-dimensional)
feature_cols = ["Hours_Studied"]
label_cols = ["Exam_Score"]
X = df[feature_cols]
y = df[label_cols]

# Steps 3-4: Build the model and train it (fit() returns the estimator itself)
model = LinearRegression().fit(X, y)

# Step 5: Sanity-check the fit by predicting the training inputs
predicted_known = model.predict(X)
print("Predictions for known inputs:")
print(predicted_known)

# Step 6: Predict for an unseen input of 6 hours, using the same column name
# the model was trained with
new_input = pd.DataFrame({"Hours_Studied": [6]})
predicted_score = model.predict(new_input)
print("Prediction for 6 hours studied:")
print(predicted_score)


Polynomial Regression - Practice Problem

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Step 1: Input data
speed = [[20], [40], [60], [80], [100]]     # Features: one speed value per row
mileage = [8, 15, 22, 20, 15]               # Labels: mileage observed at each speed

# Step 2: Transform into polynomial features (bias, speed, speed^2)
poly = PolynomialFeatures(degree=2)
speed_poly = poly.fit_transform(speed)

# Step 3: Create the model
model = LinearRegression()

# Step 4: Train the model on the expanded features
model.fit(speed_poly, mileage)

# Step 5: Make prediction for 120 km/h.
# Note: 120 lies outside the 20-100 training range, so this is an extrapolation.
predicted_mileage = model.predict(poly.transform([[120]]))
print("Predicted mileage at 120 km/h: ")
print(predicted_mileage)  # What do you think it will print?


Mean Absolute Error (MAE) - Practice Problem

In [None]:
from sklearn.metrics import mean_absolute_error

# Each pair is (actual value, predicted value)
observations = [(30, 28), (45, 50), (50, 47), (65, 60), (80, 85)]
y_true = [actual for actual, _ in observations]
y_pred = [predicted for _, predicted in observations]

# Mean Absolute Error: average size of the prediction errors
mae = mean_absolute_error(y_true, y_pred)
print("MAE:", mae)


Mean Squared Error (MSE) - Practice Problem

In [None]:
import math
from sklearn.metrics import mean_squared_error

# Step 1: Actual scores and the model's predicted scores, position by position
y_true, y_pred = [72, 88, 95, 60], [70, 90, 92, 65]

# Step 2: MSE from sklearn; RMSE is its square root (same units as the scores)
mse = mean_squared_error(y_true, y_pred)
rmse = math.sqrt(mse)

# Report both metrics rounded to two decimals
for caption, metric in (
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
):
    print(caption, round(metric, 2))


R-Squared (R²) - Practice Problem

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Step 1: Build the toy dataset
data = dict(
    Sleep_Hours=[4, 5, 6, 7, 8],
    Productivity_Score=[50, 60, 65, 75, 85],
)
df = pd.DataFrame(data)

# Feature matrix (2-D) and label vector (1-D)
feature_cols = ["Sleep_Hours"]
X = df[feature_cols]
y = df["Productivity_Score"]

# Step 2: Train the linear model (fit() returns the fitted estimator)
model = LinearRegression().fit(X, y)

# Step 3: Predict on the same inputs the model was trained on
y_pred = model.predict(X)

# Step 4: R² compares the predictions against the actual scores
r2 = r2_score(y, y_pred)

print("Actual Scores:", list(y))
print("Predicted Scores:", list(y_pred.round(2)))
print("R² Score:", round(r2, 2))


Overfitting and Underfitting - Practice Problem

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Step 0: Toy dataset shared by the plotting helpers below
X = np.arange(2, 16, 2).reshape(-1, 1)        # Hours studied: 2, 4, ..., 14 as a column
y = np.array([30, 45, 55, 65, 70, 80, 85])    # Exam scores, one per row of X

# Dense grid of 100 points from 0 to 15 hours, used to draw smooth curves
X_test = np.linspace(0, 15, 100)[:, np.newaxis]

# Function to plot model predictions
def plot_model(degree, title, subplot):
    """Fit a polynomial regression of the given degree and draw it in one panel.

    Reads the module-level ``X``, ``y`` (training data) and ``X_test``
    (plotting grid), and draws into the current matplotlib figure.

    Args:
        degree: Degree passed to ``PolynomialFeatures``.
        title: Title shown above the panel.
        subplot: 1-based panel index within the 1x3 subplot grid.
    """
    # Expand the inputs into polynomial terms and train on them
    expander = PolynomialFeatures(degree=degree)
    regressor = LinearRegression()
    regressor.fit(expander.fit_transform(X), y)

    # Evaluate the fitted curve on the dense grid (same expansion, no refit)
    curve = regressor.predict(expander.transform(X_test))

    # Draw the raw points and the fitted curve in the requested panel
    plt.subplot(1, 3, subplot)
    plt.scatter(X, y, color="red", label="Data Points")
    plt.plot(X_test, curve, color="blue", label=f"Degree {degree} Fit")
    plt.title(title)
    plt.xlabel("Study Hours")
    plt.ylabel("Exam Score")
    plt.legend()

def main():
    """Draw the three fitting scenarios side by side and save them to a PNG."""
    # (polynomial degree, panel title), listed in left-to-right panel order
    scenarios = (
        (1, "Underfitting (Linear)"),
        (2, "Good Fit (Quadratic)"),
        (6, "Overfitting (6th Degree)"),
    )

    plt.figure(figsize=(15, 4))
    for panel, (degree, title) in enumerate(scenarios, start=1):
        plot_model(degree=degree, title=title, subplot=panel)

    plt.tight_layout()
    plt.savefig("fitting_examples.png", dpi=300)  # high-resolution output
    plt.close()  # release the figure's memory
    print("Plots saved as fitting_examples.png")

if __name__ == "__main__":
    main()


Linear Regression Project

In [None]:
# Linear Regression Practice Problem with Plot and Main Function

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def linear_regression_task(csv_file="students.csv", predict_hours=5, save_plot="regresion_plot.png"):
    """
    Performs Linear Regression on a dataset, predicts score, calculates MSE, and plots the regression line.

    Args:
        csv_file (str): Path of the CSV to load; it must contain the columns
            "Hours_Studied" and "Exam_Score".
        predict_hours (float): Number of study hours to predict a score for.
        save_plot (str): Path the plot image is written to. The default file
            name is kept as-is for backward compatibility.

    Returns:
        predicted_score (float): Predicted score for given hours
        mse (float): Mean Squared Error of the model
    """

    # Step 1: Load the dataset from the path the caller asked for
    df = pd.read_csv(csv_file)

    # Step 2: Define features (X) and target (y)
    X = df[["Hours_Studied"]]
    y = df["Exam_Score"]

    # Step 3: Create the linear regression model
    model = LinearRegression()

    # Step 4: Fit the model
    model.fit(X, y)

    # Step 5: Make prediction. The query is a DataFrame with the training
    # column name so the model sees consistently named features.
    query = pd.DataFrame({"Hours_Studied": [predict_hours]})
    predicted_score = model.predict(query)[0]
    print(f"Predicted exam score for studying {predict_hours} hours is: {predicted_score}")

    # Step 6: Calculate Mean Squared Error on the training data
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    print(f"Mean Squared Error of the model: {mse}")

    # Step 7: Plot the data points and regression line
    plt.scatter(X, y, color="blue", label="Actual Data")
    plt.plot(X, y_pred, color="red", label="Regression Line")
    plt.xlabel("Hours Studied")
    plt.ylabel("Exam Score")
    plt.title("Linear Regression: Exam Score vs Hours Studied")
    plt.legend()
    plt.savefig(save_plot)  # Save the plot as an image
    plt.close()

    return predicted_score, mse

# Main function to run when script is executed directly
if __name__ == "__main__":
    linear_regression_task()


Polynomial Regression Project

In [None]:
# Polynomial Regression Practice Problem with Plot and Main Function

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def polynomial_regression_task(csv_file="students.csv", degree=2, predict_hours=5, save_plot="regression_plot.png"):
    """
    Performs Polynomial Regression on a dataset, predicts score, calculates MSE, and plots the regression curve.

    Args:
        csv_file (str): Path of the CSV to load; it must contain the columns
            "Hours_Studied" and "Exam_Score".
        degree (int): Degree of the polynomial feature expansion.
        predict_hours (float): Number of study hours to predict a score for.
        save_plot (str): Path the plot image is written to.

    Returns:
        predicted_score (float): Predicted score for given hours
        mse (float): Mean Squared Error of the model
    """

    # Step 1: Load the dataset from the path the caller asked for
    df = pd.read_csv(csv_file)

    # Step 2: Define features (X) and target (y)
    X = df[["Hours_Studied"]]
    y = df["Exam_Score"]

    # Step 3: Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    # Step 4: Create the linear regression model
    model = LinearRegression()

    # Step 5: Fit the model on the expanded features
    model.fit(X_poly, y)

    # Step 6: Make prediction. The query carries the training column name so
    # the feature expander sees consistently named input.
    query = pd.DataFrame({"Hours_Studied": [predict_hours]})
    predicted_score = model.predict(poly.transform(query))[0]
    print(f"Predicted exam score for studying {predict_hours} hours is: {predicted_score}")

    # Step 7: Calculate Mean Squared Error on the training data
    y_pred = model.predict(X_poly)
    mse = mean_squared_error(y, y_pred)
    print(f"Mean Squared Error of the model: {mse}")

    # Step 8: Plot the data points and polynomial regression curve
    plt.scatter(X, y, color="blue", label="Actual Data")

    # Smooth curve for plotting: 100 evenly spaced points across the observed
    # range, taken from the column's scalar min/max.
    hours = X["Hours_Studied"]
    X_grid = pd.DataFrame({"Hours_Studied": np.linspace(hours.min(), hours.max(), 100)})
    plt.plot(
        X_grid["Hours_Studied"],
        model.predict(poly.transform(X_grid)),
        color="red",
        label="Polynomial Regression Curve",
    )

    plt.xlabel("Hours Studied")
    plt.ylabel("Exam Score")
    plt.title("Polynomial Regression: Exam Score vs Hours Studied")
    plt.legend()
    plt.savefig(save_plot)
    plt.close()

    return predicted_score, mse

# Main function to run when script is executed directly
if __name__ == "__main__":
    polynomial_regression_task()


Logistic Regression - Practice Problem

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Dataset: hours slept and whether the person was alert (1) or not (0)
data = dict(
    Sleep_Hours=[3, 4, 5, 6, 7, 8],
    Alert=[0, 0, 0, 1, 1, 1],
)

# Tabular form of the dictionary
df = pd.DataFrame(data)

# Feature matrix (2-D) and label vector (1-D)
feature_cols = ["Sleep_Hours"]
X = df[feature_cols]
y = df["Alert"]

# Build the classifier and train it
model = LogisticRegression()
model.fit(X, y)

# Hard class predictions for the training inputs
predictions = model.predict(X)
print("Predictions:", predictions)

# Per-class probability estimates for the same inputs
probabilities = model.predict_proba(X)
print("Probabilities:\n", probabilities)


From Score to Probability - Practice Problem

In [None]:
import numpy as np

# Step 1: Define the sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    Accepts a scalar, a NumPy array, or any array-like (e.g. a list). The
    computation only ever exponentiates a non-positive number, so very large
    positive or negative inputs do not overflow.

    Args:
        x: Raw score(s).

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) is always in (0, 1], so it cannot overflow
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook formula; for x < 0 it is the
    # algebraically equivalent exp(x) / (1 + exp(x))
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves higher-dimensional arrays as they are
    return result[()]

# Step 2: Raw scores, standing in for the linear step of Logistic Regression
scores = np.array([-3.0, -0.5, 0.5, 2.5, 4.0])

# Step 3: Squash every score into a probability
probabilities = sigmoid(scores)

# Step 4: Threshold at 0.5: class 1 at or above it, class 0 below
predictions = np.where(probabilities >= 0.5, 1, 0)

# Show score, probability and final class side by side
for raw_score, probability, predicted_class in zip(scores, probabilities, predictions):
    print(f"Score: {raw_score:>4} -> Probability: {probability:.2f} -> Prediction: {predicted_class}")


Accuracy with Code - Practice Problem

In [None]:
from sklearn.metrics import accuracy_score

# Steps 1-2: Each pair is (actual label, predicted label)
labelled_pairs = [
    (1, 1), (0, 0), (1, 1), (1, 0), (0, 0),
    (1, 1), (0, 0), (1, 1), (0, 0), (1, 1),
]
y_true, y_pred = (list(column) for column in zip(*labelled_pairs))

# Step 3: Fraction of positions where prediction and ground truth agree
acc = accuracy_score(y_true, y_pred)

# Step 4: Report it
print("Accuracy:", acc)


Precision and Recall - Practice Problem

In [None]:
from sklearn.metrics import precision_score, recall_score

# Ground-truth labels and the model's predictions, position by position
y_true = [0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
y_pred = [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1]

# Evaluate both metrics on the same label pair
precision, recall = (
    metric(y_true, y_pred) for metric in (precision_score, recall_score)
)

for caption, value in (("Precision:", precision), ("Recall:", recall)):
    print(caption, value)


F1-Score - Practice Problem

In [None]:
# TODO: Import the required function
from sklearn.metrics import f1_score  # TODO

# Ground truth first, model output second; both lists line up by position
y_true, y_pred = (
    [1, 0, 1, 1, 0, 1, 0, 1, 0, 1],
    [1, 0, 1, 0, 0, 1, 0, 1, 0, 1],
)

# F1 combines precision and recall into a single score
f1 = f1_score(y_true, y_pred)

# Report the score
print("F1-score:", f1)



Project - Predict Students Result

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Load dataset
def load_data(path):
    """Load the student dataset and split it into features and target.

    Args:
        path: Location of the CSV file; it must contain the columns
            "Hours_Studied" and "Exam_Pass".

    Returns:
        X: One-column DataFrame holding "Hours_Studied".
        y: Series holding "Exam_Pass".
    """
    # Read the file the caller asked for
    df = pd.read_csv(path)

    # Double brackets keep the feature as a DataFrame (2-D), not a Series
    X = df[['Hours_Studied']]

    # Target column
    y = df['Exam_Pass']

    return X, y


# Step 2: Train logistic regression
def train_model(X, y):
    """Train a logistic regression on a 70/30 train/test split.

    Args:
        X: Feature matrix.
        y: Target labels.

    Returns:
        The fitted model, followed by the held-out test features and labels.
    """
    # 30% of the rows are held out; the fixed seed makes the split repeatable
    parts = train_test_split(X, y, test_size=0.3, random_state=42)
    X_fit, X_holdout, y_fit, y_holdout = parts

    # fit() returns the estimator, so construction and training chain together
    classifier = LogisticRegression().fit(X_fit, y_fit)

    return classifier, X_holdout, y_holdout


# Step 3: Evaluate predictions
def evaluate_model(model, X_test, y_test):
    """Score the model's predictions on the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    predicted = model.predict(X_test)

    # Order here is the order of the returned tuple
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predicted) for scorer in scorers)


# Step 4: Display results
def display_results(model, accuracy, precision, recall, f1):
    """Print the model's class name followed by one line per metric."""
    print("Model used:", type(model).__name__)

    # (caption, value) pairs in the order they are printed
    report = (
        ("Accuracy:", accuracy),
        ("Precision:", precision),
        ("Recall:", recall),
        ("F1-score:", f1),
    )
    for caption, value in report:
        print(caption, value)



# Step 5: Main workflow
if __name__ == "__main__":
    # Dataset location
    path = "students.csv"

    # Load -> train -> evaluate -> report
    X, y = load_data(path)
    model, X_test, y_test = train_model(X, y)

    metrics = evaluate_model(model, X_test, y_test)
    accuracy, precision, recall, f1 = metrics

    display_results(model, accuracy, precision, recall, f1)


Decision Tree - Practice Problem

In [None]:
# Import pandas for handling CSV/dataframes
import pandas as pd

# Import DecisionTreeClassifier from sklearn
from sklearn.tree import DecisionTreeClassifier


# Step 1: Toy dataset with two features and a pass/fail label
data = dict(
    Study_Hours=[2, 3, 4, 5, 6],       # hours studied
    Attendance=[60, 70, 80, 85, 90],   # attendance percentages
    Pass=[0, 0, 1, 1, 1],              # 0 = Fail, 1 = Pass
)
df = pd.DataFrame(data)

# Step 2: Split the frame into features and label
feature_cols = ["Study_Hours", "Attendance"]
X = df[feature_cols]
y = df["Pass"]

# Step 3: Build the tree with a fixed seed and train it
# (fit() returns the fitted estimator)
model = DecisionTreeClassifier(random_state=42).fit(X, y)

# Step 4: Predict the training rows and show the result as a plain list
predictions = model.predict(X)
print("Predictions:", predictions.tolist())


Random Forest - Worked Example

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier


# Toy dataset with two features and a pass/fail label
data = dict(
    Study_Hours=[2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 7, 8, 9],
    Attendance=[60, 65, 70, 75, 80, 62, 68, 72, 78, 85, 88, 90, 95],
    Pass=[0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1],
)

df = pd.DataFrame(data)

feature_cols = ["Study_Hours", "Attendance"]
X = df[feature_cols]
y = df["Pass"]

# Forest settings are collected in one place, then unpacked into the constructor
forest_settings = dict(
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=2,
    random_state=42,
)
model = RandomForestClassifier(**forest_settings)
model.fit(X, y)
predictions = model.predict(X)

# Class probabilities for every training row, rounded to one decimal
probabilities = np.round(model.predict_proba(X), 1)

# Hard predictions first
print("Predictions:", predictions.tolist())

# Then one line per row: [P(fail) P(pass)] with a percentage comment
for fail, pass_ in probabilities:
    print(f"[{fail:.1f} {pass_:.1f}]   # {int(fail*100)}% Fail, {int(pass_*100)}% Pass")


Naive Bayes - Practice Problem

In [None]:
# TODO: Import pandas for handling CSV
import pandas as pd

# TODO: Import CountVectorizer and MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Step 1: Read the notification messages
df = pd.read_csv("student_notifications.csv")
messages = df["Message"]

# Step 2: Bag-of-words features: one column per distinct token
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(messages)

# Step 3: Target labels
y = df["Spam"]

# Vocabulary learned by the vectorizer
print("Feature Names:", vectorizer.get_feature_names_out(), "\n")

# Step 4: Build the Naive Bayes classifier and train it on the counts
model = MultinomialNB()
model.fit(X, y)

# Step 5: Predict on the same messages the model was trained on
predictions = model.predict(X)
print("Predictions:", predictions.tolist())

# Step 6: Per-class probability estimates for each message
probabilities = model.predict_proba(X)

# Step 7: Show each message with its two class probabilities
# (the unpacking assumes exactly two classes)
for msg, (not_spam, spam) in zip(messages, probabilities):
    print(f"Message: {msg}\n[{not_spam:.2f} {spam:.2f}]   # Probabilities\n")


Project – Credit Card Fraud Detection

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def train_credit_card_fraud_model():
    """
    Loads the dataset, splits into train-test sets, scales features,
    trains MultinomialNB, Decision Tree, and Random Forest,
    and returns key variables for testing.

    The scaler is fitted on the training split only, so no information from
    the test rows leaks into the features the models are trained on.

    Returns:
        models (dict): Fitted estimators keyed by model name.
        scaler (MinMaxScaler): Scaler fitted on the training features.
        X_train, X_test: Scaled feature arrays for each split.
        y_train, y_test: Target labels for each split.
    """

    # Step 1: Load Dataset
    df = pd.read_csv("creditcard.csv")

    # Step 2: Prepare Features and Target
    X = df.drop("Class", axis=1)                # all columns except target
    y = df["Class"]                             # target column (0 Not Fraud, 1 Fraud)

    # Step 3: Train-Test Split (done before scaling to avoid leakage)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y                              # Maintain fraud ratio
    )

    # Step 4: Scale Features to [0, 1] using training-set statistics only.
    # clip=True keeps transformed test values inside [0, 1] even when a test
    # row falls outside the range seen during fitting.
    scaler = MinMaxScaler(clip=True)
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Step 5: Define Models (seeded so repeated runs give the same result)
    models = {
        "MultinomialNB": MultinomialNB(),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
    }

    # Step 6: Train and Evaluate
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)

        print(f"{name} Accuracy: {acc * 100:.2f}%")

    # Return important variables for testing
    return models, scaler, X_train, X_test, y_train, y_test


# Run training only when executing script directly
# The module name lives in the double-underscore variable __name__, and it
# equals "__main__" only when the file is executed directly.
if __name__ == "__main__":
    train_credit_card_fraud_model()


Project – Clean IMDB Movie Dataset

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# ---------------------------------------------------------
# Initial Setup: NLTK Stopwords
# ---------------------------------------------------------
def _load_english_stopwords():
    """Return NLTK's English stopwords as a set, downloading the corpus if it is missing."""
    try:
        return set(stopwords.words('english'))
    except LookupError:
        # Corpus not available locally yet: fetch it once, then retry
        print("Downloading NLTK stopwords...")
        nltk.download('stopwords')
        return set(stopwords.words('english'))


STOP_WORDS = _load_english_stopwords()


# ---------------------------------------------------------
# Function: clean_text
# ---------------------------------------------------------
def clean_text(text: str) -> str:
    """
    Normalise one raw review into a space-separated string of content words.

    Pipeline: replace HTML tags with a space, lowercase, replace every
    character other than a-z and space with a space, split on whitespace,
    drop tokens found in STOP_WORDS, and re-join with single spaces.
    Anything that is not a string yields "".
    """
    if isinstance(text, str):
        # Tags become a space so the words on either side stay separate
        without_tags = re.sub(r'<.*?>', ' ', text)

        # Lowercase first, then blank out everything except letters and spaces
        letters_only = re.sub(r'[^a-z ]', ' ', without_tags.lower())

        # Whitespace tokenisation followed by stopword removal
        content_words = filter(
            lambda token: token not in STOP_WORDS, letters_only.split()
        )
        return " ".join(content_words)

    return ""


# ---------------------------------------------------------
# Main Execution Pipeline
# ---------------------------------------------------------
def main():
    """Read the raw IMDB CSV, clean every review, and write the cleaned CSV."""

    source_path = 'IMDB Dataset.csv'
    target_path = 'cleaned_imdb_dataset.csv'

    print(f"Loading data from '{source_path}'...")

    # 1. Load dataset; stop early with a hint if the file is missing
    try:
        reviews = pd.read_csv(source_path)
    except FileNotFoundError:
        print(f"Error: Dataset not found at '{source_path}'.")
        print("Please download it and place it in the correct directory.")
        return

    # 2. Clean each review in turn
    print("Applying text cleaning function to all reviews...")
    reviews['cleaned_review'] = [clean_text(raw) for raw in reviews['review']]

    # 3. Only the cleaned text and its label go into the output
    output_columns = ['cleaned_review', 'sentiment']
    final_df = reviews[output_columns]

    # 4. Save output without the index column
    print(f"Saving cleaned data to '{target_path}'...")
    final_df.to_csv(target_path, index=False)

    print("Data cleaning process complete.")
    print(f"Cleaned data saved to '{target_path}'")


# ---------------------------------------------------------
# Script Entry Point
# ---------------------------------------------------------
# __name__ (double underscores) equals "__main__" only when the file is
# executed directly.
if __name__ == "__main__":
    main()


Building and Saving the Sentiment Model

In [None]:
import pandas as pd
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# File Paths (relative, so they resolve against the current working directory)
# INPUT_FILE: CSV read by train_model(); it uses the 'cleaned_review' and 'sentiment' columns.
INPUT_FILE = 'cleaned_imdb_dataset.csv'
# OUTPUT_FILE: pickle written by train_model() and read back by run_prediction_example().
OUTPUT_FILE = 'sentiment_model.pkl'


# -------------------------------------------------------------
# MODEL TRAINING FUNCTION
# -------------------------------------------------------------
def train_model():
    """Main function to train and save the sentiment analysis model.

    Reads INPUT_FILE, trains a bag-of-words MultinomialNB classifier on an
    80/20 split, prints the test accuracy, and pickles the fitted vectorizer
    and model together into OUTPUT_FILE.

    Errors are reported on stdout rather than raised, so the caller can carry
    on; nothing is returned.
    """

    print("--- Starting Model Training ---")

    try:
        # Task 1: Load the Data
        df = pd.read_csv(INPUT_FILE)

        # A review that was cleaned down to an empty string is read back from
        # CSV as a missing value, which the vectorizer cannot handle. Such rows
        # carry no words to learn from, so drop them (and unlabeled rows).
        df = df.dropna(subset=['cleaned_review', 'sentiment'])

        # Task 2: Prepare Features and Target
        X = df['cleaned_review'].astype(str)
        y = df['sentiment']

        # Task 3: Train-Test Split (80/20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Task 4: Vectorize the Text (vocabulary learned from training rows only)
        vectorizer = CountVectorizer()
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # Task 5: Train the Model
        model = MultinomialNB()
        model.fit(X_train_vec, y_train)

        # Task 6: Evaluate Performance
        predictions = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, predictions)
        print(f"Model Accuracy: {accuracy * 100:.2f}%")

        # Task 7: Save Vectorizer + Model as Artifacts. They are stored
        # together because prediction needs the same vocabulary.
        artifacts = {
            'vectorizer': vectorizer,
            'model': model
        }

        with open(OUTPUT_FILE, 'wb') as f:
            pickle.dump(artifacts, f)

        print(f"--- Model and Vectorizer saved successfully to '{OUTPUT_FILE}' ---")

    except FileNotFoundError:
        print(f"Error: The input file '{INPUT_FILE}' was not found.")
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return
        print(f"An unexpected error occurred: {e}")


# -------------------------------------------------------------
# TEXT CLEANING FOR PREDICTION
# -------------------------------------------------------------
def clean_text(text):
    """Clean a raw review before it is vectorized for prediction.

    HTML tags and every character other than a-z or whitespace are replaced
    by a space (not deleted), so words joined by punctuation stay separate
    tokens, e.g. "well-made" becomes "well made" rather than "wellmade".
    Whitespace runs are then collapsed to single spaces.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercased string; "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r'<.*?>', ' ', text)      # drop HTML tags such as <br />
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)   # punctuation & numbers -> space
    return " ".join(text.split())           # collapse whitespace runs


# -------------------------------------------------------------
# PREDICTION EXAMPLE
# -------------------------------------------------------------
def run_prediction_example():
    """Load the pickled vectorizer/model pair and classify a few sample reviews.

    Problems are reported on stdout instead of being raised.
    """

    print("\n--- Running Prediction Example ---")

    try:
        # NOTE(review): pickle.load executes arbitrary code from the file, so
        # OUTPUT_FILE must only ever be a file produced by trusted code.
        with open(OUTPUT_FILE, 'rb') as handle:
            bundle = pickle.load(handle)

        vectorizer, model = bundle['vectorizer'], bundle['model']

        sample_reviews = (
            "This movie was absolutely fantastic, one of the best I have seen all year!",
            "A complete waste of time. The plot was boring and the acting was terrible.",
            "It was an okay film, not great but not bad either.",
        )

        for review in sample_reviews:
            # Clean, vectorize as a one-document batch, and take the single prediction
            features = vectorizer.transform([clean_text(review)])
            sentiment = model.predict(features)[0]

            print(f"\nReview: '{review}'")
            print(f"Predicted Sentiment: {sentiment}")

    except FileNotFoundError:
        print(f"Could not load '{OUTPUT_FILE}'. Please train the model first.")
    except Exception as e:
        print(f"An error occurred during prediction: {e}")


# -------------------------------------------------------------
# SCRIPT ENTRY POINT
# -------------------------------------------------------------
# __name__ (double underscores) equals "__main__" only when the file is
# executed directly.
if __name__ == "__main__":
    train_model()
    run_prediction_example()


Part 1 - Titanic Survival Prediction

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load training data
data = pd.read_csv("Titanic.csv")

# Keep only weaker baseline features.
# .copy() makes the subset an independent frame, so the column assignment
# below modifies it directly instead of a slice of the original.
features = ["Pclass", "Sex"]
target = "Survived"
data = data[features + [target]].copy()

# Convert categorical to numeric (values outside the mapping become NaN)
data["Sex"] = data["Sex"].map({"male": 0, "female": 1})

# Remove missing values
data = data.dropna()

# Split into features and target
X = data[features]
y = data[target]

# Train-Test Split for fair evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train logistic regression (low C means stronger regularization)
model = LogisticRegression(max_iter=500, C=0.1, random_state=42)
model.fit(X_train, y_train)

# Predictions
preds = model.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, preds)
print(f"Baseline Logistic Regression Accuracy: {accuracy * 100:.2f}%")


Part 2 - Build a Better Titanic Prediction Model

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# from sklearn.ensemble import RandomForestClassifier  # Optional upgrade

def preprocess(df):
    """Return a copy of ``df`` whose numeric gaps are filled with column medians.

    Only numeric columns have a median, so missing values in other columns
    are left untouched. The input frame is not modified.
    """
    column_medians = df.median(numeric_only=True)
    filled = df.fillna(value=column_medians)
    return filled

def train_and_evaluate():
    """Train a decision tree on the Titanic training file and predict the test file.

    Missing numeric values in the test set are filled with the medians of the
    training set, so both sets are imputed with the same statistics. Test
    columns are put in the training column order before prediction.

    Returns:
        Array of predicted "Survived" labels, one per row of the test file.
    """
    # Load training data
    train_data = pd.read_csv("Titanic_train.csv")

    # Separate features and target
    y_train = train_data["Survived"]
    train_features = train_data.drop(columns=["Survived"])

    # Remember the training medians before imputing, for reuse on the test set
    train_medians = train_features.median(numeric_only=True)
    X_train = preprocess(train_features)

    # Train Decision Tree
    model = DecisionTreeClassifier(max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # Load test data; fill with training medians first, then run the shared
    # preprocessing (which only has gaps left to fill in columns the training
    # medians did not cover)
    test_data = pd.read_csv("Titanic_test.csv")
    X_test = preprocess(test_data.fillna(train_medians))

    # Same columns, same order as the model was trained on
    X_test = X_test[X_train.columns]

    # Predict
    preds = model.predict(X_test)

    return preds

if __name__ == "__main__":
    preds = train_and_evaluate()

    # Compare the predictions against the held-out ground-truth labels
    labels_frame = pd.read_csv("Titanic_test_labels.csv")
    true_labels = labels_frame["Survived"]
    acc = accuracy_score(true_labels, preds)

    print(f"The model accuracy: {acc*100:.2f}%")


K-Means on a Toy Dataset - Practice Problem

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Step 1: Generate a toy dataset
# 150 samples drawn around 4 centers; the true blob labels are not needed
X, _ = make_blobs(n_samples=150, centers=4, random_state=42)

# Step 2: Train KMeans
# Ask for as many clusters as there are blobs
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit the model and get one cluster label per sample
labels = kmeans.fit_predict(X)

# Step 3: Visualize the clusters
# Points colored by their assigned cluster
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')

# Cluster centers (centroids) drawn on top as large red X markers
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            c='red', marker='X', s=200, label="Centroids")

plt.legend()
plt.savefig("kmeans_practice.png")
plt.close()
