In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Read the Covertype CSV into the working frame. The absolute path below looks
# like a Colab working directory -- adjust it when running elsewhere.
df = pd.read_csv("/content/covtype.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Count missing values in each column (isna is the same method as isnull).
df.isna().sum()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Drop every row that contains at least one missing value
# (these are dropna's defaults, spelled out).
df = df.dropna(axis=0, how="any")

In [None]:
# Re-check the per-column missing counts now that rows with NaNs were dropped.
df.isna().sum(axis=0)

In [None]:
# Number of rows per Cover_Type value (the column later used as the target).
df['Cover_Type'].value_counts()

In [None]:
# Pairwise correlation between all int64/float64 columns, drawn as a heatmap
# centred on zero so positive and negative correlations get opposite colours.
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
corr_matrix = df.loc[:, numerical_cols].corr()

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0)
heatmap_ax.set_title("Correlation Matrix for Numerical Features in Covertype")
plt.show()

In [None]:
# Side-by-side box plots of every numeric column on its original scale.
# NOTE(review): newer Matplotlib releases rename boxplot's `labels` argument
# to `tick_labels`; confirm against the installed version before changing it.
num = df.select_dtypes(include="number")

plt.figure(figsize=(20, 8))
plt.boxplot(num.to_numpy(), labels=num.columns, vert=True)
plt.title("Boxplot for Numeric Columns")
plt.xticks(rotation=90)
plt.show()


In [None]:
# Standardize every column of df (zero mean, unit variance per column).
# NOTE: this includes the Cover_Type column, so df_scaled should not be fed
# to a model as-is.
sc = StandardScaler()
data = sc.fit_transform(df)
# Carry over df's row labels: after dropna() they are not guaranteed to be
# 0..n-1, and a fresh RangeIndex would silently misalign df_scaled with df.
df_scaled = pd.DataFrame(data, columns=df.columns, index=df.index)

In [None]:

# Box plots of the standardized columns, so all features share one scale.
# (Standardizing does not remove outliers; it only makes the spreads
# comparable across columns.)
num = df_scaled.select_dtypes(include="number")

plt.figure(figsize=(18, 8))
scaled_ax = plt.gca()
scaled_ax.boxplot(num.to_numpy(), labels=num.columns, vert=True)
plt.setp(scaled_ax.get_xticklabels(), rotation=80)
scaled_ax.set_title("Boxplot for Numeric Columns")
plt.show()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Target is the Cover_Type column; features are every other column.
y = df["Cover_Type"]
X = df.drop(columns=["Cover_Type"])

In [None]:
# Project the standardized features onto their first two principal components.
# The features are scaled right here so this cell does not depend on an
# `X_scaled` that is only created further down the notebook (which raised a
# NameError on a fresh "Restart & Run All"). PCA is scale-sensitive, so the
# features are standardized before fitting.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Tabulate the two PCA coordinates next to the class label, keeping y's row index.
pca_df = pd.DataFrame(X_pca[:, :2], columns=["PC1", "PC2"], index=y.index)
pca_df["Cover_Type"] = y

In [None]:
# One scatter layer per cover type (ascending order), so every class gets its
# own colour and legend entry.
plt.figure(figsize=(10, 7))
for cover_type, points in pca_df.groupby("Cover_Type", sort=True):
    plt.scatter(points["PC1"], points["PC2"], label=f"Class {cover_type}", alpha=0.5, s=10)

plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA Visualization of Covertype Data")
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Rebuild the feature matrix (all columns but Cover_Type) and the target
# for the modelling cells.
X = df.loc[:, df.columns != "Cover_Type"]
y = df.loc[:, "Cover_Type"]

In [None]:
# Standardization: learn each feature's mean and standard deviation from X,
# then rescale X with them.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Train/Test split
# Split the *unscaled* features first and fit the scaler on the training rows
# only; scaling all rows before the split lets test-set statistics leak into
# training. The row assignment is unchanged, because it depends only on
# random_state and the stratification labels, not on the feature values.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [None]:
# Logistic Regression: one-vs-rest classifier fitted on the training split,
# then used to predict the held-out test rows.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version before upgrading.
log_reg = LogisticRegression(multi_class="ovr", max_iter=1000).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

In [None]:
# Share of test rows the logistic regression classified correctly.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression")
print("Accuracy:", lr_accuracy)

In [None]:
# Precision averaged two ways: unweighted over classes (macro) and weighted
# by each class's number of true instances (weighted).
lr_precision_macro = precision_score(y_test, y_pred_lr, average="macro")
lr_precision_weighted = precision_score(y_test, y_pred_lr, average="weighted")
print("Precision (macro):", lr_precision_macro)
print("Precision (weighted):", lr_precision_weighted)

In [None]:
# Recall under both averaging schemes, macro first and then weighted.
for averaging in ("macro", "weighted"):
    print(f"Recall ({averaging}):", recall_score(y_test, y_pred_lr, average=averaging))

In [None]:
# F1 score under both averaging schemes, computed once and then reported.
f1_by_average = {
    avg: f1_score(y_test, y_pred_lr, average=avg) for avg in ("macro", "weighted")
}
print("F1 (macro):", f1_by_average["macro"])
print("F1 (weighted):", f1_by_average["weighted"])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# ----- Confusion Matrix -----
# Counts of (actual, predicted) class pairs for the logistic regression on
# the test split, drawn as an annotated heatmap.
y_pred = log_reg.predict(X_test)
# Fix the label order once and use it for both the matrix and the tick labels.
# Taking the ticks from y_test alone would mismatch the axes if the model ever
# predicted a class that is absent from the test labels.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# ----- Random Forest -----
# 200-tree forest with a fixed seed, fitted on the training split and then
# used to predict the test split.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)