**Part 1:**
1. Load the dataset and preprocess the reviews.

a. Convert all text to lowercase.

b. Remove non-alphabetic characters (punctuation).

c. Tokenize the reviews and remove common stopwords.

d. Apply stemming to reduce words to their root form.


---



In [None]:
import pandas as pd

# Location of the IMDB reviews CSV inside the mounted Google Drive.
path = "/content/drive/MyDrive/Concepts and technologies of AI/IMDB Dataset.csv"

df = pd.read_csv(path)

# Quick look at the first rows and the column schema.
print(df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook are located below that directory.
drive.mount("/content/drive")

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# The stop-word corpus must be downloaded before stopwords.words() is called.
nltk.download("stopwords")

# Module-level helpers used by preprocess_text(): the English stop words are
# held in a set for membership tests, and one stemmer instance is reused.
stop_words = {word for word in stopwords.words("english")}
stemmer = PorterStemmer()

# Patterns are compiled once because preprocess_text runs on every review.
_LINE_BREAK_TAG_RE = re.compile(r"<br\s*/?>")
_APOSTROPHE_RE = re.compile(r"['’]")
_NON_ALPHA_RE = re.compile(r"[^a-z\s]")


def preprocess_text(text):
    """Normalise one raw review into a space-joined string of stemmed tokens.

    Steps: lowercase, drop non-alphabetic characters, split on whitespace,
    discard stop words, stem the remaining words.

    Non-letters are replaced by a space rather than deleted, so words that
    are separated only by punctuation stay separate ("good,bad" gives two
    tokens instead of the single token "goodbad"). Apostrophes are the one
    exception: they are deleted so a contraction remains a single token
    ("don't" -> "dont").

    Args:
        text: The raw review as a ``str``. Other types (e.g. a NaN float)
            raise ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned review: stemmed, stop-word-free tokens joined by spaces.
        May be an empty string.
    """
    # Lowercase
    text = text.lower()
    # NOTE(review): presumably the raw reviews embed HTML line breaks
    # ("<br />") -- confirm against the data. Dropping them here keeps "br"
    # from becoming a token; the pattern is a no-op when no tag is present.
    text = _LINE_BREAK_TAG_RE.sub(" ", text)
    # Remove non-alphabetic characters (see docstring for the two cases)
    text = _APOSTROPHE_RE.sub("", text)
    text = _NON_ALPHA_RE.sub(" ", text)
    # Tokenize
    tokens = text.split()
    # Remove stopwords and apply stemming
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Clean every review and store the result next to the raw text.
df['clean_review'] = [preprocess_text(raw_review) for raw_review in df['review']]

# Show raw text, cleaned text and label side by side for a sanity check.
preview_columns = ['review', 'clean_review', 'sentiment']
print(df[preview_columns].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


2. Split the dataset into training and testing sets (80% training, 20% testing).

---



In [None]:
from sklearn.model_selection import train_test_split

# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_codes = {'positive': 1, 'negative': 0}

X = df['clean_review']
y = df['sentiment'].map(sentiment_codes)

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(len(X_train), len(X_test))

3. Use a Naive Bayes classifier to classify the reviews into positive and negative categories.

a. Implement a Bag-of-Words model using CountVectorizer.

b. Train the Naive Bayes classifier using the training set.

---



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-Words: the vocabulary is learned from the training reviews only and
# then reused unchanged to encode the test reviews.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Fit Multinomial Naive Bayes on the training word counts.
nb_classifier = MultinomialNB().fit(X_train_bow, y_train)

# Hard class predictions (0/1) for the held-out reviews.
y_pred = nb_classifier.predict(X_test_bow)

**Part 2:**
1. Evaluate the performance of the model using the following metrics:

a. Accuracy

b. Precision, Recall, and F1-score

c. Confusion Matrix

d. ROC-AUC Score

---



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Threshold-based metrics are computed from the predicted class labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# ROC-AUC measures how well the model *ranks* positives above negatives, so it
# needs a continuous score. Feeding it the 0/1 predictions would reduce the
# ROC curve to a single operating point. Column 1 of predict_proba is the
# probability of label 1 (the positive class).
y_prob = nb_classifier.predict_proba(X_test_bow)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:\n", conf_matrix)
print("ROC-AUC Score:", roc_auc)

# **3.1 Feature Selection using Wrapper Methods.**
**Part 1: Data Loading and Preprocessing**

1. Load the Breast Cancer Prognostic Dataset.
2. Dataset is available in Drive.
3. Perform basic exploratory data analysis (EDA) to understand the dataset:

• Summarize key statistics for each feature.

• Check for missing values and handle them appropriately.

4. Split the dataset into training (80%) and testing (20%) sets.

---



In [None]:
import pandas as pd

# NOTE(review): this folder is spelled "Concept and technologies of AI" while
# the IMDB cell reads from "Concepts and technologies of AI" -- confirm which
# Drive folder actually exists.
path = "/content/drive/MyDrive/Concept and technologies of AI/wpbc.data"

# The file has no header row. Missing entries are written as '?', so they are
# parsed as NaN here; otherwise the affected column would load as text and be
# invisible to describe(), isnull() and numeric mean imputation.
data = pd.read_csv(path, header=None, na_values="?")

# Column labels: identifier, outcome and time, then ten cell-nucleus
# measurements in three variants (mean, standard error, worst), followed by
# tumor size and lymph-node status.
column_names = [
    "id", "status", "time",
    "mean_radius", "mean_texture", "mean_perimeter", "mean_area",
    "mean_smoothness", "mean_compactness", "mean_concavity",
    "mean_concave_points", "mean_symmetry", "mean_fractal_dimension",
    "radius_se", "texture_se", "perimeter_se", "area_se",
    "smoothness_se", "compactness_se", "concavity_se",
    "concave_points_se", "symmetry_se", "fractal_dimension_se",
    "worst_radius", "worst_texture", "worst_perimeter",
    "worst_area", "worst_smoothness", "worst_compactness",
    "worst_concavity", "worst_concave_points",
    "worst_symmetry", "worst_fractal_dimension",
    "tumor_size", "lymph_nodes"
]

# Assigning the labels raises ValueError if the file's column count differs
# from the 35 names above, which guards against loading the wrong file.
data.columns = column_names
data.head()


In [None]:
# Basic EDA: per-column summary statistics and missing-value counts.
summary_stats = data.describe()
missing_per_column = data.isnull().sum()

print("===== SUMMARY STATISTICS =====")
print(summary_stats)

print("\n===== MISSING VALUES =====")
print(missing_per_column)

# Mean-impute the float64/int64 columns only; other dtypes are left untouched.
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
numeric_part = data[numeric_cols]
data[numeric_cols] = numeric_part.fillna(numeric_part.mean())


In [None]:
# Prepare target variable: encode the outcome as N -> 0, R -> 1.
data["status"] = data["status"].map({"N": 0, "R": 1})

# Drop ID column (not needed for prediction) and the target itself.
X = data.drop(["id", "status"], axis=1)
y = data["status"]

# Split into training (80%) and testing (20%). Stratifying on the label keeps
# the N/R proportion the same in both parts, matching how the IMDB split in
# this notebook is done; without it a small test set can end up with very few
# samples of one class, which makes precision/recall/ROC-AUC unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTraining set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

**Part 2: Apply a Wrapper Method**

1. Use Recursive Feature Elimination (RFE) with a Logistic Regression model to perform feature selection:

• Select the top 5 features that contribute the most to predicting the target variable.

• Visualize the ranking of features.

2. Train the Logistic Regression model using only the selected features.

---



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Replace '?' placeholders with NaN and make every feature column numeric.
X_train = X_train.replace('?', np.nan).astype(float)
X_test = X_test.replace('?', np.nan).astype(float)

# Impute with statistics learned from the TRAINING data only. Filling the test
# set with its own column means would leak test information into the
# preprocessing and would be inconsistent with the scaler below, which is also
# fit on the training set and merely applied to the test set.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Scale features: fit on train, apply the same transform to test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression is the estimator wrapped by RFE.
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Run RFE on the scaled training data, keeping five features.
rfe = RFE(estimator=logreg, n_features_to_select=5).fit(X_train_scaled, y_train)

# Pair each column name with its RFE rank and order the table by rank.
ranking_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': rfe.ranking_,
})
feature_ranking = ranking_table.sort_values(by='Ranking')

print("===== FEATURE RANKING =====")
print(feature_ranking)

# Horizontal bar chart of the RFE rank of every feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_ranking['Feature'], feature_ranking['Ranking'])
ax.set_xlabel("Ranking (1 = Most Important)")
ax.set_ylabel("Feature")
ax.set_title("RFE Feature Ranking")
# The table is sorted by rank, so flipping the axis puts rank 1 at the top.
ax.invert_yaxis()
plt.show()

# The selected features are exactly the rows that RFE ranked 1.
is_selected = feature_ranking['Ranking'] == 1
top_features = feature_ranking.loc[is_selected, 'Feature'].tolist()
print("\nTop 5 selected features:", top_features)


In [None]:
from sklearn.linear_model import LogisticRegression

# Restrict both splits to the features chosen by RFE.
# NOTE(review): RFE ranked the features on X_train_scaled, but these columns
# come from the unscaled X_train / X_test -- confirm this is intended.
X_train_top, X_test_top = X_train[top_features], X_test[top_features]

# Fit a fresh logistic regression on the reduced feature set.
logreg_top = LogisticRegression(max_iter=1000, random_state=42)
logreg_top.fit(X_train_top, y_train)

# Accuracy on the data the model was fit on versus the held-out data.
train_acc = logreg_top.score(X_train_top, y_train)
test_acc = logreg_top.score(X_test_top, y_test)

for split_name, split_acc in (("Training", train_acc), ("Testing", test_acc)):
    print(f"{split_name} Accuracy: {split_acc:.4f}")


**Part 3: Model Evaluation**

1. Evaluate the model’s performance using the testing set:

• Metrics to calculate: Accuracy, Precision, Recall, F1-Score, and ROC-AUC.

2. Compare the performance of the model trained on all features versus the model trained on the selected
features.

---



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def _labels_and_scores(model, features):
    """Return (predicted labels, column 1 of predict_proba) for *features*."""
    return model.predict(features), model.predict_proba(features)[:, 1]


# Baseline: logistic regression trained on every feature column.
logreg_all = LogisticRegression(max_iter=1000, random_state=42)
logreg_all.fit(X_train, y_train)
y_pred_all, y_prob_all = _labels_and_scores(logreg_all, X_test)

# The top-5 model was already fit in the previous cell; only predict here.
y_pred_top, y_prob_top = _labels_and_scores(logreg_top, X_test_top)

# Evaluation metrics
def evaluate_model(y_true, y_pred, y_prob):
    """Compute the classification metrics reported in this notebook.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for accuracy, precision,
            recall and F1.
        y_prob: Predicted scores/probabilities, used only for ROC-AUC.

    Returns:
        dict mapping "Accuracy", "Precision", "Recall", "F1-Score" and
        "ROC-AUC" (in that order) to the corresponding score.
    """
    label_based_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    scores = {
        metric_name: metric_fn(y_true, y_pred)
        for metric_name, metric_fn in label_based_metrics
    }
    # ROC-AUC is the only metric that is fed the scores instead of the labels.
    scores["ROC-AUC"] = roc_auc_score(y_true, y_prob)
    return scores

# Score the all-features model and the top-5 model on the same test labels.
metrics_all = evaluate_model(y_test, y_pred_all, y_prob_all)
metrics_top = evaluate_model(y_test, y_pred_top, y_prob_top)

# Print both score tables under their own heading.
print("===== MODEL PERFORMANCE =====\n")
report_sections = (
    ("Model trained on ALL features:", metrics_all),
    ("\nModel trained on TOP 5 features:", metrics_top),
)
for heading, scores in report_sections:
    print(heading)
    for metric_name, metric_value in scores.items():
        print(f"{metric_name}: {metric_value:.4f}")


**Part 4: Experiment**
1. Experiment with different numbers of selected features (e.g., top 3, top 7).
2. Discuss how feature selection affects model performance.

---



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Numbers of features to experiment with
feature_numbers = [3, 5, 7]

# Maps "Top n features" -> {"features": [...], "metrics": {...}}
results = {}

for n in feature_numbers:
    # Apply RFE on the scaled training data, keeping n features
    rfe_exp = RFE(
        estimator=LogisticRegression(max_iter=1000, random_state=42),
        n_features_to_select=n,
    )
    rfe_exp.fit(X_train_scaled, y_train)

    # support_ is a boolean mask over the columns; map it back to names
    top_n_features = X_train.columns[rfe_exp.support_].tolist()

    # Train Logistic Regression on selected features
    # NOTE(review): selection above used X_train_scaled, training here uses
    # the unscaled columns -- confirm this is intended.
    X_train_n = X_train[top_n_features]
    X_test_n = X_test[top_n_features]
    model_n = LogisticRegression(max_iter=1000, random_state=42)
    model_n.fit(X_train_n, y_train)

    # Predictions: hard labels plus column 1 of predict_proba for ROC-AUC
    y_pred_n = model_n.predict(X_test_n)
    y_prob_n = model_n.predict_proba(X_test_n)[:, 1]

    # Evaluate with the shared helper from the model-evaluation cell instead
    # of repeating the metric dictionary, so every experiment reports exactly
    # the same metrics as the all-features / top-5 comparison. (This cell
    # already relies on that cell for the metric imports.)
    metrics_n = evaluate_model(y_test, y_pred_n, y_prob_n)

    results[f"Top {n} features"] = {"features": top_n_features, "metrics": metrics_n}

# Print results
for key, value in results.items():
    print(f"\n===== {key} =====")
    print("Selected Features:", value["features"])
    print("Metrics:")
    for metric, score in value["metrics"].items():
        print(f"{metric}: {score:.4f}")
