In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the wine data
wine_data = pd.read_csv('exrc06p01_wine.csv')

# Drop rows with missing values
wine_data.dropna(inplace=True)

# Define features (X) and target (y)
X = wine_data.drop(columns=['type'])
y = wine_data['type']

# Split the data into train and test sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9891696750902527
Confusion Matrix:
[[ 465   12]
 [   9 1453]]


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the voice data
voice_data = pd.read_csv('exrc06p02_voice.csv')

# Encode the label column
label_encoder = LabelEncoder()
voice_data['label'] = label_encoder.fit_transform(voice_data['label'])

# Define features (X) and target (y)
X = voice_data.drop(columns=['label'])
y = voice_data['label']

# Split the data into train and test sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Support Vector Machine model
model = SVC()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.6624605678233438
Confusion Matrix:
[[261 191]
 [130 369]]


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the NBA dataset
nba_data = pd.read_csv('exrc06p03_nba.csv')

# Fill missing values with the median of each column
imputer = SimpleImputer(strategy='median')
nba_data_filled = pd.DataFrame(imputer.fit_transform(nba_data.select_dtypes(include=['number'])), columns=nba_data.select_dtypes(include=['number']).columns)

# Encode the target variable
label_encoder = LabelEncoder()
nba_data_filled['TARGET_5Yrs'] = label_encoder.fit_transform(nba_data['TARGET_5Yrs'])

# Define features (X) and target (y)
X = nba_data_filled.drop(columns=['TARGET_5Yrs'])
y = nba_data_filled['TARGET_5Yrs']

# Split the dataset into train and test sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_scaled, y_train)
logistic_pred = logistic_model.predict(X_test_scaled)

# Evaluate Logistic Regression model
logistic_accuracy = accuracy_score(y_test, logistic_pred)
logistic_cm = confusion_matrix(y_test, logistic_pred)

print("Logistic Regression:")
print("Accuracy:", logistic_accuracy)
print("Confusion Matrix:")
print(logistic_cm)

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)

# Evaluate SVM model
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_cm = confusion_matrix(y_test, svm_pred)

print("\nSupport Vector Machine:")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:")
print(svm_cm)


Logistic Regression:
Accuracy: 0.7213930348258707
Confusion Matrix:
[[ 78  78]
 [ 34 212]]

Support Vector Machine:
Accuracy: 0.7189054726368159
Confusion Matrix:
[[ 80  76]
 [ 37 209]]


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the mushroom dataset
mushroom_data = pd.read_csv('exrc06p04_mushrooms.csv')

# Separate features (X) and target (y)
X = mushroom_data.drop(columns=['class'])
y = mushroom_data['class']

# Perform one-hot encoding
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = encoder.fit_transform(X)

# Split the dataset into train and test sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 1.0
Confusion Matrix:
[[1257    0]
 [   0 1181]]


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Load the loan dataset
loan_data_path = 'loan.csv'
loan_data = pd.read_csv(loan_data_path)

# Let's view the first few rows of the dataframe and summarize missing values
loan_data.head()

loan_data.isnull().sum()

# Drop the Loan_ID column as it is not relevant for prediction
loan_data.drop('Loan_ID', axis=1, inplace=True)

# Encode binary categorical fields to 0/1
binary_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Loan_Status']
label_encoder = LabelEncoder()
for col in binary_cols:
    # Drop rows where binary categorical column is NaN before encoding
    loan_data = loan_data.dropna(subset=[col])
    loan_data[col] = label_encoder.fit_transform(loan_data[col])

# Encode Dependents as it has a small number of unique values
# First, fill missing values with the most frequent value
most_frequent_dependent = loan_data['Dependents'].mode()[0]
loan_data['Dependents'].fillna(most_frequent_dependent, inplace=True)
loan_data = pd.get_dummies(loan_data, columns=['Dependents', 'Property_Area'], drop_first=True)

# Replace missing values in numerical columns with the median
numerical_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
loan_data[numerical_cols] = loan_data[numerical_cols].fillna(loan_data[numerical_cols].median())

# Remove outliers more than 3 standard deviations from the mean
numerical_cols.extend(['ApplicantIncome', 'CoapplicantIncome'])
for col in numerical_cols:
    mean = loan_data[col].mean()
    std_dev = loan_data[col].std()
    loan_data = loan_data[(loan_data[col] <= mean + 3 * std_dev) & (loan_data[col] >= mean - 3 * std_dev)]

# Check the modified dataframe
loan_data.head()

loan_data.info()

# Separate the target variable and features
y_loan = loan_data['Loan_Status']
X_loan = loan_data.drop('Loan_Status', axis=1)

# Split the dataset into training and testing sets
X_train_loan, X_test_loan, y_train_loan, y_test_loan = train_test_split(X_loan, y_loan, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
log_reg_loan = LogisticRegression(max_iter=1000)
log_reg_loan.fit(X_train_loan, y_train_loan)

# Given data point for prediction
data_point = pd.DataFrame({
    'Gender': [1],  # Male
    'Married': [0],  # No
    'Education': [0],  # Graduate
    'Self_Employed': [0],  # No
    'ApplicantIncome': [2400],
    'CoapplicantIncome': [2000],
    'LoanAmount': [36],
    'Loan_Amount_Term': [360],
    'Credit_History': [1],  # 1
    'Dependents_1.0': [0],
    'Dependents_2.0': [0],
    'Dependents_3.0': [0],
    'Property_Area_Semiurban': [0],
    'Property_Area_Urban': [1]  # Urban
})

# Predict the probability of Loan_Status being Yes
probability = log_reg_loan.predict_proba(data_point)[:, 1]

print(f"Probability: {probability[0]*100:.2f}%")



<class 'pandas.core.frame.DataFrame'>
Int64Index: 846 entries, 0 to 980
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   846 non-null    int64  
 1   Married                  846 non-null    int64  
 2   Education                846 non-null    int64  
 3   Self_Employed            846 non-null    int64  
 4   ApplicantIncome          846 non-null    int64  
 5   CoapplicantIncome        846 non-null    float64
 6   LoanAmount               846 non-null    float64
 7   Loan_Amount_Term         846 non-null    float64
 8   Credit_History           846 non-null    float64
 9   Loan_Status              846 non-null    int64  
 10  Dependents_1.0           846 non-null    uint8  
 11  Dependents_2.0           846 non-null    uint8  
 12  Dependents_3.0           846 non-null    uint8  
 13  Property_Area_Semiurban  846 non-null    uint8  
 14  Property_Area_Urban      8

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the loan dataset
loan_data_path = 'loan.csv'
loan_data = pd.read_csv(loan_data_path)

# Drop the Loan_ID column as it is not relevant for prediction
loan_data.drop('Loan_ID', axis=1, inplace=True)

# Encode binary categorical fields to 0/1
binary_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Loan_Status']
label_encoder = LabelEncoder()
for col in binary_cols:
    loan_data[col] = label_encoder.fit_transform(loan_data[col])

# Encode Dependents as it has a small number of unique values
# First, fill missing values with the most frequent value
most_frequent_dependent = loan_data['Dependents'].mode()[0]
loan_data['Dependents'].fillna(most_frequent_dependent, inplace=True)
loan_data = pd.get_dummies(loan_data, columns=['Dependents', 'Property_Area'], drop_first=True)

# Replace missing values in numerical columns with the median
numerical_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
loan_data[numerical_cols] = loan_data[numerical_cols].fillna(loan_data[numerical_cols].median())

# Remove outliers more than 3 standard deviations from the mean
numerical_cols.extend(['ApplicantIncome', 'CoapplicantIncome'])
for col in numerical_cols:
    mean = loan_data[col].mean()
    std_dev = loan_data[col].std()
    loan_data = loan_data[(loan_data[col] <= mean + 3 * std_dev) & (loan_data[col] >= mean - 3 * std_dev)]

# Separate the target variable and features
y_loan = loan_data['Loan_Status']
X_loan = loan_data.drop('Loan_Status', axis=1)

# Split the dataset into training and testing sets
X_train_loan, X_test_loan, y_train_loan, y_test_loan = train_test_split(X_loan, y_loan, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
log_reg_loan = LogisticRegression(max_iter=1000)
log_reg_loan.fit(X_train_loan, y_train_loan)

# Given data point for prediction
data_point = pd.DataFrame({
    'Gender': [1],  # Male
    'Married': [0],  # No
    'Education': [0],  # Graduate
    'Self_Employed': [0],  # No
    'ApplicantIncome': [2400],
    'CoapplicantIncome': [2000],
    'LoanAmount': [36],
    'Loan_Amount_Term': [360],
    'Credit_History': [1],  # 1
    'Dependents_1.0': [0],
    'Dependents_2.0': [0],
    'Dependents_3.0': [0],
    'Property_Area_Semiurban': [0],
    'Property_Area_Urban': [1]  # Urban
})

# Predict the probability of Loan_Status being Yes
probability = log_reg_loan.predict_proba(data_point)[:, 1]

# print(f"Probability: {probability[0]*100:.2f}%")

print("Given Data Point for Prediction:")
print(data_point)
print("\nProbability of Loan_Status being Yes:")
print(f"{probability[0]*100:.2f}%")


Given Data Point for Prediction:
   Gender  Married  Education  Self_Employed  ApplicantIncome  \
0       1        0          0              0             2400   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0               2000          36               360               1   

   Dependents_1.0  Dependents_2.0  Dependents_3.0  Property_Area_Semiurban  \
0               0               0               0                        0   

   Property_Area_Urban  
0                    1  

Probability of Loan_Status being Yes:
85.74%


In [23]:
import sys
sys.path.append('/home/varpha/data_analytics/lib')
from handin import handin_exrc_06
handin_exrc_06()


All the relevant .ipynb files in your current directory:

  1. exrc6_soln.ipynb



Please input the order number (the one in the beginning of the line) of the file you want to hand in:  1



Your answers to exercises 06 were handed in successfully. Thank you!
You may double check your handin by calling

    /home/varpha/data_analytics/bin/check_handin.sh

from the terminal prompt.
