In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/content/Cleaned_WikiArt_Emotions.tsv', sep='\t')

In [None]:
df.head()

Unnamed: 0,ID,Style,Category,Artist,Title,Year,Is painting,Face/body,Ave. art rating,Positive,Negative,Other/Mixed
0,58c6237dedc2c9c7dc0de1ae,Modern Art,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,yes,face,2.33,1,0,0
1,577280dfedc2cb3880f28e76,Modern Art,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,yes,body,0.7,0,1,2
2,57727f2dedc2cb3880ed5fa9,Modern Art,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,yes,face,1.6,2,0,0
3,58d1240cedc2c94f900fc610,Modern Art,Cubism,Vadym Meller,Monk. For the Play &#39;Mazeppa&#39;,1920,yes,face,0.82,0,0,0
4,57727de7edc2cb3880e91f26,Post Renaissance Art,Romanticism,David Wilkie,The Defence of Sarago&#231;a,1828,yes,face,1.69,1,0,0


# Preparing the dataset for modeling

## Converting year to integer
Treating Year as an integer ensures that models handle it as a distinct category rather than an arbitrary decimal value. Year represents a fixed, categorical point in time rather than a measurement on a continuous scale. As a result, integer is the best type for it.

In [None]:
# For better modeling, I want to convert Year to numeric and coerce errors to NaN
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

In [None]:
df["Year"].dtype

dtype('float64')

In [None]:
#check the conversion
df[df["Year"].isna()]

Unnamed: 0,ID,Style,Category,Artist,Title,Year,Is painting,Face/body,Ave. art rating,Positive,Negative,Other/Mixed
27,57728459edc2cb3880fdb65d,Modern Art,Lyrical Abstraction,Afro,Figure,,yes,none,0.00,0,0,1
37,57727023edc2cb3880bcfefa,Renaissance Art,Early Renaissance,Pietro Perugino,Christ Handing the Keys to St. Peter,,yes,face,1.70,3,0,0
47,5772798fedc2cb3880dbc750,Renaissance Art,Northern Renaissance,Martin Schongauer,Ecce Homo,,yes,face,0.55,0,1,0
48,57728219edc2cb3880f6b0be,Post Renaissance Art,Neoclassicism,Vieira Portuense,S&#250;plica de In&#234;s de Castro,,yes,face,1.50,0,2,0
53,57726ea4edc2cb3880b7e54e,Post Renaissance Art,Realism,Jean-Francois Millet,Harvesters Resting,,yes,face,1.80,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4019,57727f90edc2cb3880ee35c8,Modern Art,Impressionism,Ion Andreescu,Still Life,,yes,none,1.71,1,0,0
4026,57726e50edc2cb3880b6a7c0,Renaissance Art,Northern Renaissance,Rogier van der Weyden,Pierre Bladelin Triptych,,yes,face,1.45,2,0,0
4035,57727d74edc2cb3880e7cb16,Renaissance Art,Northern Renaissance,Bernhard Strigel,Emperor Maximilian I (1459-1519),,yes,face,0.70,1,0,0
4036,57728223edc2cb3880f6bef7,Post Renaissance Art,Neoclassicism,Christoffer Wilhelm Eckersberg,Model in the Studio,,yes,face,0.27,0,0,0


In [None]:
#Create a median year for each art style to fill nan values
df["Year"] = df.groupby("Style")["Year"].transform(lambda x: x.fillna(x.median())).astype(int)

In [None]:
#After filling nan values, convert float to integer
df["Year"] = df["Year"].astype(int)

In [None]:
df["Year"].dtype

dtype('int64')

## Encoding categorical values

In [None]:
# Encoding categorical columns
df = pd.get_dummies(df, columns=["Style", "Category", "Artist"], drop_first=True)

# Check new column names to verify encoding
df.head()

Unnamed: 0,ID,Title,Year,Is painting,Face/body,Ave. art rating,Positive,Negative,Other/Mixed,"Style_Contemporary Art,Modern Art",...,Artist_Wyndham Lewis,Artist_Xul Solar,Artist_Yannoulis Chalepas,Artist_Yervand Kochar,Artist_Yves Gaucher,Artist_Yves Klein,Artist_Yves Tanguy,Artist_Zao Wou-Ki,Artist_Zaya,Artist_Zhang Xiaogang
0,58c6237dedc2c9c7dc0de1ae,In the Luxembourg Garden,1889,yes,face,2.33,1,0,0,False,...,False,False,False,False,False,False,False,False,False,False
1,577280dfedc2cb3880f28e76,The Marriage of Heaven and Hell,1984,yes,body,0.7,0,1,2,False,...,False,False,False,False,False,False,False,False,False,False
2,57727f2dedc2cb3880ed5fa9,Uncle Piacsek in front of the Black Sideboard,1906,yes,face,1.6,2,0,0,False,...,False,False,False,False,False,False,False,False,False,False
3,58d1240cedc2c94f900fc610,Monk. For the Play &#39;Mazeppa&#39;,1920,yes,face,0.82,0,0,0,False,...,False,False,False,False,False,False,False,False,False,False
4,57727de7edc2cb3880e91f26,The Defence of Sarago&#231;a,1828,yes,face,1.69,1,0,0,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
#creating an emotions column to consolidate positive/negative/other

def classify_emotion(row):
    if row["Positive"] > 0:
        return "Positive"
    elif row["Negative"] > 0:
        return "Negative"
    else:
        return "Mixed/Other"

# Apply function to create the Emotion column
df["Emotion"] = df.apply(classify_emotion, axis=1)

print(df["Emotion"].value_counts())

Emotion
Positive       1831
Mixed/Other    1747
Negative        527
Name: count, dtype: int64


In [None]:
# Define features (X) by dropping unnecessary columns
X = df.drop(columns=["ID", "Title", "Emotion"])  # Drop non-predictive columns

# Define target variable (y)
y = df["Emotion"]

In [None]:
# Check shapes
X.shape, y.shape

((4105, 1057), (4105,))

## Training data

In [None]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Define how much data to use for training (80%)
train_size = int(0.8 * len(df))

# split for x and y
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3284, 1057), (821, 1057), (3284,), (821,))

In [None]:
# Check data types of X_train
print(X_train.dtypes[X_train.dtypes == 'object'])

Is painting    object
Face/body      object
dtype: object


In [None]:
# Convert "yes"/"no" in 'Is painting' to 1/0
df["Is painting"] = df["Is painting"].map({"yes": 1, "no": 0})

# Convert "face"/"body" in 'Face/body' to 1/0
df["Face/body"] = df["Face/body"].map({"face": 1, "body": 0})

# Verify the conversion
print(df[["Is painting", "Face/body"]].head())

   Is painting  Face/body
0            1        NaN
1            1        NaN
2            1        1.0
3            1        1.0
4            0        NaN


In [None]:
print(df["Face/body"].unique())

[nan  1.  0.]


In [None]:
# Fill NaN values with the most common category (mode)
df["Face/body"] = df["Face/body"].fillna(df["Face/body"].mode()[0])

# Verify there are no more NaNs
print(df["Face/body"].unique())

[1. 0.]


In [None]:
#training the model
from sklearn.ensemble import RandomForestClassifier

X_train = X_train.apply(pd.to_numeric, errors="coerce")
X_test = X_test.apply(pd.to_numeric, errors="coerce")

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

#Evaluating the model

In [None]:
# Make predictions on the test set
y_pred = clf.predict(X_test)

In [None]:
# Store predictions in a DataFrame
predictions_df = X_test.copy()
predictions_df["Actual Emotion"] = y_test.values
predictions_df["Predicted Emotion"] = y_pred

# Display predictions
predictions_df.head()

Unnamed: 0,Year,Is painting,Face/body,Ave. art rating,Positive,Negative,Other/Mixed,"Style_Contemporary Art,Modern Art",Style_Modern Art,"Style_Modern Art,Post Renaissance Art",...,Artist_Yannoulis Chalepas,Artist_Yervand Kochar,Artist_Yves Gaucher,Artist_Yves Klein,Artist_Yves Tanguy,Artist_Zao Wou-Ki,Artist_Zaya,Artist_Zhang Xiaogang,Actual Emotion,Predicted Emotion
3284,1929,,,-0.08,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,Mixed/Other,Mixed/Other
3285,1948,,,1.09,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,Mixed/Other,Mixed/Other
3286,1923,,,1.3,2,0,0,False,True,False,...,False,False,False,False,False,False,False,False,Positive,Positive
3287,1868,,,1.95,1,0,0,False,False,False,...,False,False,False,False,False,False,False,False,Positive,Positive
3288,1620,,,1.8,0,1,0,False,False,False,...,False,False,False,False,False,False,False,False,Negative,Negative


In [None]:
#confusion matrix
conf_matrix = pd.crosstab(predictions_df["Actual Emotion"], predictions_df["Predicted Emotion"], rownames=["Actual"], colnames=["Predicted"])

print(conf_matrix)

Predicted    Mixed/Other  Negative  Positive
Actual                                      
Mixed/Other          364         0         0
Negative               0        95         0
Positive               0         0       362


In [None]:
# Calculate accuracy
accuracy = (predictions_df["Actual Emotion"] == predictions_df["Predicted Emotion"]).mean()
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 1.00


#Input values for prediction

In [None]:
# Step 1: Take user input
year = int(input("Enter the Year of the artwork: "))
is_painting = input("Is it a painting? (yes/no): ").strip().lower()
face_body = input("Does it focus on a face or body? (face/body): ").strip().lower()
style = input("Enter the style (Renaissance Art, Post-Renaissance Art, Modern Art, and Contemporary Art): ").strip()
category = input("Enter the category (must match dataset categories exactly): ").strip()
artist = input("Enter the artist name: ").strip()

# Step 2: Convert binary categorical values to numeric
is_painting = 1 if is_painting == "yes" else 0
face_body = 1 if face_body == "face" else 0

# Step 3: Create a base DataFrame for numeric features
input_dict = {
    "Year": [year],
    "Is painting": [is_painting],
    "Face/body": [face_body]
}
input_data = pd.DataFrame(input_dict)

# Step 4: One-hot encode categorical variables by creating a blank row with all 0s
categorical_data = pd.DataFrame(0, index=[0], columns=X_train.columns)

# Step 5: Set the correct one-hot values
for col in categorical_data.columns:
    if style in col:
        categorical_data[col] = 1
    elif category in col:
        categorical_data[col] = 1
    elif artist in col:
        categorical_data[col] = 1

# Step 6: Merge numeric and categorical data
final_input = pd.concat([input_data, categorical_data], axis=1)

# Step 7: Ensure final_input matches X_train exactly
# Remove duplicate columns in both datasets before reindexing
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
final_input = final_input.loc[:, ~final_input.columns.duplicated()]

#reindex final_input to match X_train
final_input = final_input.reindex(columns=X_train.columns, fill_value=0)

# Step 8: Make a prediction
prediction = clf.predict(final_input)[0]

# Step 9: Display the result
print(f"\nPredicted Emotional Response: {prediction}")

Enter the Year of the artwork: 1889
Is it a painting? (yes/no): yes
Does it focus on a face or body? (face/body): body
Enter the style (Renaissance Art, Post-Renaissance Art, Modern Art, and Contemporary Art): modern art
Enter the category (must match dataset categories exactly): post-impressionism
Enter the artist name: vincent van gogh

Predicted Emotional Response: Mixed/Other


In [None]:
# madi's model #

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

from google.colab import files

file_path = "/content/Cleaned_WikiArt_Emotions.tsv"
df = pd.read_csv(file_path, sep='\t')

def classify_emotion(row):
    emotions = {"Positive": row["Positive"], "Negative": row["Negative"], "Other/Mixed": row["Other/Mixed"]}
    return max(emotions, key=emotions.get)

df["Label"] = df.apply(classify_emotion, axis=1)

features = ["Style", "Category", "Year", "Face/body"]
target = "Label"

label_encoders = {}
for col in ["Style", "Category", "Face/body"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df["Year"] = df.groupby("Style")["Year"].transform(lambda x: x.fillna(x.median())).astype(int)

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"model accuracy: {accuracy:2f}")
print("classification report:\\n", report)

model accuracy: 0.655298
classification report:\n               precision    recall  f1-score   support

    Negative       0.18      0.10      0.13       102
 Other/Mixed       0.57      0.60      0.58       205
    Positive       0.74      0.79      0.76       514

    accuracy                           0.66       821
   macro avg       0.50      0.50      0.49       821
weighted avg       0.63      0.66      0.64       821



In [None]:
#sneha



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load data
df = pd.read_csv("/content/Cleaned_WikiArt_Emotions.tsv", sep="\t")

# Define function to assign emotion
def assign_emotion(row):
    if row["Positive"] > 0:
        return "Positive"
    elif row["Negative"] > 0:
        return "Negative"
    else:
        return "Mixed"

df["Emotion"] = df.apply(assign_emotion, axis=1)

# Keep only the required features
df_cleaned = df[["Year", "Style", "Category", "Is painting", "Face/body", "Emotion"]]

# One-hot encoding for categorical features
df_encoded = pd.get_dummies(df_cleaned, columns=["Year", "Style", "Category", "Is painting", "Face/body"])

# Split data into features and target variable
X = df_encoded.drop(columns=["Emotion"])
y = df_encoded["Emotion"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display results
print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Model Accuracy: 0.68
Classification Report:
               precision    recall  f1-score   support

       Mixed       0.71      0.77      0.73       345
    Negative       0.21      0.10      0.13       104
    Positive       0.71      0.76      0.73       372

    accuracy                           0.68       821
   macro avg       0.54      0.54      0.53       821
weighted avg       0.64      0.68      0.66       821

