In [12]:
import pandas as pd

## Data Loading

Load the dataset and perform sanity checks.

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv('Dataset4.csv')
# Show the first rows as a quick visual check that the file parsed as expected
df.head()


In [None]:
# Summarise column names, non-null counts, dtypes and memory usage
df.info()


In [None]:
# How many distinct labels does the target column contain?
# dropna=False keeps a missing label in the count, exactly as len(unique()) would.
n_target_classes = df['target'].nunique(dropna=False)
print("Number of target classes:", n_target_classes)

# Columns stored with the generic object dtype (typically strings)
string_columns = df.select_dtypes(include=['object']).columns

# Frequency table of the values in each of those columns
print("Value counts for string columns:")
for col in string_columns:
    print(f"\n{col}:", df[col].value_counts(), sep="\n")



In [None]:
# Exploratory overview: target distribution, summary statistics, missing values
# and pairwise correlation of the numeric columns.
import matplotlib.pyplot as plt

# Distinct labels in the target column and how often each occurs
print("Unique values in target column:")
print(df['target'].unique())
print("\nValue counts in target column:")
print(df['target'].value_counts())

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
print("\nBasic statistics of numeric columns:")
print(df.describe())

# Number of missing entries per column
print("\nMissing values in dataset:")
print(df.isnull().sum())

# Bar chart of class frequencies to reveal any class imbalance
plt.figure(figsize=(10,6))
df['target'].value_counts().plot(kind='bar')
plt.title('Distribution of Target Classes')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Correlation analysis with numeric columns.
# include='number' selects every numeric dtype (int32, float32, unsigned ints, ...),
# not only the float64/int64 columns, so no numeric feature is silently left out.
numeric_cols = df.select_dtypes(include='number').columns
correlation = df[numeric_cols].corr()

plt.figure(figsize=(12,8))
# Pin the colour scale to [-1, 1] so the neutral colour of the diverging map
# always corresponds to zero correlation, whatever range the data happens to span.
plt.imshow(correlation, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(range(len(correlation.columns)), correlation.columns, rotation=90)
plt.yticks(range(len(correlation.columns)), correlation.columns)
plt.title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.show()


### Data Preprocessing

*One Hot Encoding* of string columns (except the target)

In [None]:
# Categorical (object-dtype) feature columns; the target is excluded because it is
# not meant to be expanded into indicator columns here.
string_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'target'
]

# Expand each categorical column into one indicator column per distinct value
df_encoded = pd.get_dummies(df, columns=string_cols)

# List the added columns in their DataFrame order so the printed output is
# reproducible (a set difference prints in an arbitrary order that can change
# between kernel sessions).
original_columns = set(df.columns)
new_columns = [col for col in df_encoded.columns if col not in original_columns]

print("\nShape before one-hot encoding:", df.shape)
print("Shape after one-hot encoding:", df_encoded.shape)
print("\nNew columns added:", new_columns)

# Replace original dataframe with encoded version; later cells keep using `df`
df = df_encoded


*Encode* the target variable as integer class labels

In [None]:
# Map every distinct target label to an integer code with scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_target = le.fit_transform(df['target'])
df['target'] = encoded_target

# le.classes_ lists the original labels; position i holds the label encoded as i
print("\nUnique target values after encoding:")
for i, label in zip(range(len(le.classes_)), le.classes_):
    print(f"{label}: {i}")


In [None]:
# Correlation of every column with the (integer-encoded) target, highest first.
# NOTE(review): the integer codes assigned to the classes carry no inherent order,
# so with more than two classes these coefficients are only a rough guide - confirm
# that a linear correlation is the intended measure.
target_correlations = df.corr()['target'].sort_values(ascending=False)

# Remove the target's self-correlation by label rather than by position (position 0
# is not guaranteed when another column also correlates at exactly 1.0) and discard
# undefined (NaN) coefficients.
feature_correlations = target_correlations.drop('target').dropna()

# Rank by absolute value so strong negative relationships are not overlooked, and
# never request more features than actually exist.
top_n = min(10, len(feature_correlations))
ranked_features = feature_correlations.abs().sort_values(ascending=False).index[:top_n]
top_correlations = feature_correlations.loc[ranked_features]

# Display the most strongly correlated features (signed coefficients)
print(f"\nTop {top_n} features most correlated with target:")
print(top_correlations)

# Visualize top correlations; bars below the zero line are negative correlations
plt.figure(figsize=(10, 6))
plt.bar(range(top_n), top_correlations)
plt.axhline(0, color='black', linewidth=0.8)
plt.xticks(range(top_n), top_correlations.index, rotation=45, ha='right')
plt.title(f'Top {top_n} Features Most Correlated with Target')
plt.xlabel('Features')
plt.ylabel('Correlation Coefficient')
plt.tight_layout()
plt.show()
