In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the animals taxonomy dataset and preview the first rows
data = pd.read_csv("animals.csv")
print(data.head())

# Check the structure of the data
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
data.info()
print("\nColumn names:")
print(data.columns.tolist())

# Look at the top-level rank first to see which kingdoms the dataset covers
print("\nKingdom value counts:")
print(data['Kingdom'].value_counts())

# Binary target built from the 'Class' rank: 1 when the class is 'Mammalia',
# 0 for every other value (rows with a missing class compare unequal, so they
# are labelled 0 as well).
data['is_mammal'] = (data['Class'] == 'Mammalia').astype(int)

# For features, we'll use integer-encoded taxonomic information.
# NOTE: one encoder instance is refit on every column, so after the loop `le`
# only holds the mapping of the last column that was encoded.
le = LabelEncoder()

# Taxonomic columns to integer-encode
categorical_columns = ['Kingdom', 'Phylum', 'Subphylum', 'Class', 'Order', 'Family', 'Genus']

for col in categorical_columns:
    if col in data.columns:
        # Missing ranks are treated as their own 'Unknown' category
        data[col + '_encoded'] = le.fit_transform(data[col].fillna('Unknown'))

# Prepare features from the encoded columns, but keep 'Class_encoded' out of X:
# the target `is_mammal` is computed directly from 'Class', so feeding the
# encoded class to the model would leak the label into the features and make
# the evaluation meaningless. The column itself stays on `data` for inspection.
feature_columns = [
    col for col in data.columns
    if col.endswith('_encoded') and col != 'Class_encoded'
]
print(f"\nUsing features: {feature_columns}")

X = data[feature_columns]
y = data['is_mammal']

print(f"\nShape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Fit a linear regression on the training rows and predict the held-out rows
lrr = LinearRegression().fit(X_train, y_train)
y_lrr_pred = lrr.predict(X_test)

# Regression-style scores of the predictions against the 0/1 target
mse = mean_squared_error(y_true=y_test, y_pred=y_lrr_pred)
r2 = r2_score(y_true=y_test, y_pred=y_lrr_pred)

print("\nLinear Regression Results:")
print("MSE: " + str(mse))
print("R2 Score: " + str(r2))

# The target is binary, so this is really a classification task;
# remind the reader which tools fit it better.
print(
    "\nSince this is essentially a classification problem, consider using:\n"
    "- Logistic Regression instead of Linear Regression\n"
    "- Classification metrics like accuracy, precision, recall"
)

   Unnamed: 0           Animal Name   Kingdom    Phylum   Subphylum     Class  \
0           0           Arctic Wolf  Animalia  Chordata  Vertebrata  Mammalia   
1           1            Arctic Fox  Animalia  Chordata  Vertebrata  Mammalia   
2           2  Pink Fairy Armadillo  Animalia  Chordata  Vertebrata  Mammalia   
3           3     Sumatran Elephant  Animalia  Chordata  Vertebrata  Mammalia   
4           4                Ermine  Animalia  Chordata  Vertebrata  Mammalia   

  Subclass Infraclass        Order Superorder    Suborder Infraorder  \
0      NaN        NaN    Carnivora        NaN  Caniformia        NaN   
1      NaN        NaN    Carnivora        NaN  Caniformia        NaN   
2      NaN        NaN    Cingulata        NaN         NaN        NaN   
3      NaN        NaN  Proboscidea        NaN         NaN        NaN   
4      NaN        NaN    Carnivora        NaN  Caniformia        NaN   

           Family Superfamily       Subfamily         Genus Subgenus Tribe  \
0 