# Solution for the Predict Prices task

### Task 1: Load the dataset and calculate the average price and estimated owners, and output to a file named output_1.csv

In [118]:
import math

# Import necessary libraries
import pandas as pd
from typing import Any, Tuple
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import csv 
import joblib
from dateutil import parser

# Loads the dataset from the given path and returns the features and target variables, as well as the average price, average estimated owners, and the genres used for the one-hot encoding
# For the evaluation set, is_eval_dataset should be set to True and known_genres should be passed so that the same one-hot encoding as in training is used
def load_data(dataset_path: str, is_eval_dataset=False, known_genres=None) -> Tuple[Any, Any, int, int, set]:
    """Load a games CSV and turn it into feature dicts and target values.

    Args:
        dataset_path: Path of the CSV file to read (UTF-8, with a header row).
        is_eval_dataset: When True the "Price" column is not read; every
            target is reported as 0 and the average price is 0.
        known_genres: Genres that define the one-hot columns. When given, the
            collection is copied (never modified) and genres that only occur
            in this file are ignored, so the encoded columns match the ones
            produced for the training data. When None, the genres are
            collected from the file itself.

    Returns:
        A tuple ``(X, y, avg_price, avg_owners, unique_genres)``: ``X`` is a
        list with one feature dict per row, ``y`` is the list of raw "Price"
        values (zeros for an evaluation set), both averages are truncated to
        ints (0 for an empty file), and ``unique_genres`` is the set of genres
        that was one-hot encoded.

    Raises:
        KeyError: If a row lacks "Release date", "Genres", "Estimated owners"
            or (for a non-evaluation set) "Price".
        ValueError: If an owners or price value is not numeric.
    """
    # Columns considered not relevant for price prediction
    columns_to_drop = ("AppID", "Name", "Recommendations", "Publishers")
    target = "Price"

    # Work on a private copy so the caller's set is never modified; with a
    # fixed genre list the encoding must not grow with genres seen only here
    genres_are_fixed = known_genres is not None
    unique_genres = set(known_genres) if genres_are_fixed else set()

    total_price = 0.0
    total_owners = 0
    dataset = []
    # newline="" lets the csv module handle line endings inside quoted fields
    with open(dataset_path, "r", encoding="utf-8", newline="") as file:
        for row in csv.DictReader(file):
            # Tolerate files that do not contain every droppable column
            for col in columns_to_drop:
                row.pop(col, None)

            # Only the release year is kept from the release date
            row["Release date"] = parser.parse(row["Release date"]).year
            row["Estimated owners"] = _parse_estimated_owners(row["Estimated owners"])

            # Keep the split genres on the row for the encoding pass below
            genres = row["Genres"].split(",") if row["Genres"] else []
            row["Genres"] = genres
            if not genres_are_fixed:
                unique_genres.update(genres)

            if not is_eval_dataset:
                total_price += float(row[target])
            total_owners += row["Estimated owners"]
            dataset.append(row)

    # Averages are truncated to ints; an empty file yields 0 instead of a ZeroDivisionError
    sample_count = len(dataset)
    avg_price = int(total_price / sample_count) if sample_count else 0
    avg_owners = int(total_owners / sample_count) if sample_count else 0

    # Sorting makes the order of the one-hot columns independent of set iteration order
    genre_columns = sorted(unique_genres)

    # Separate features (X) and target (y), adding the one-hot genre columns
    X = []
    y = []
    for row in dataset:
        row_genres = set(row.pop("Genres"))
        features = {key: value for key, value in row.items() if key != target}
        for genre in genre_columns:
            features[genre] = 1 if genre in row_genres else 0
        X.append(features)
        y.append(0 if is_eval_dataset else row[target])

    return X, y, avg_price, avg_owners, unique_genres


# Converts an "Estimated owners" value such as "0 - 20000" to the midpoint of the range; a value without "-" is converted as a single number
def _parse_estimated_owners(raw_owners: str) -> int:
    bounds = raw_owners.split("-")
    if len(bounds) == 1:
        return int(bounds[0])
    # Only the first two parts are used, as a lower and an upper bound
    return (int(bounds[0]) + int(bounds[1])) // 2

# Load the full training dataset
X, y, avg_price, avg_owners, unique_genres = load_data("../dataset_train.csv")

# Write output_1.csv: a header line followed by one data line holding the number of samples,
# the average price, the average owners and the number of unique genres
with open("output_1.csv", "w") as file:
    file.writelines(
        (
            "Samples,Average Price,Average Owners,Unique Genres\n",
            f"{len(X)},{avg_price},{avg_owners},{len(unique_genres)}\n",
        )
    )


### Train your model - Play as you like from here to get the best AI model ####

In [119]:
# Convert the dataset to a pandas DataFrame for easier manipulation in other libraries
X = pd.DataFrame(X)
# Series.ravel() is deprecated in recent pandas versions; to_numpy() returns the same 1-D array of target values
y = pd.Series(y).to_numpy()

# Trains a random forest regressor on the given features and target values and returns the fitted model
def train_model(X: pd.DataFrame, y: pd.DataFrame) -> Any:
    """Fit a RandomForestRegressor and print its error on a held-out split.

    Args:
        X: Feature table, one row per sample.
        y: Target values aligned with the rows of ``X``.

    Returns:
        The model fitted on the training part of the split.
    """
    # Hold out 20% of the samples (fixed seed) to evaluate the fitted model
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so construction and training are chained
    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(features_train, target_train)

    # Report the mean absolute error of the predictions on the held-out samples
    mae = mean_absolute_error(target_test, model.predict(features_test))
    print("Mean Absolute Error:", mae)

    return model

# Train the model, then persist it with joblib to Output_CandidatX/trained_model.pkl
trained_model = train_model(X, y)
joblib.dump(value=trained_model, filename="Output_CandidatX/trained_model.pkl")



Mean Absolute Error: 6.6351274999999985


['trained_model.pkl']

### Task 2: Load the model and the evaluation dataset, make predictions, and output them to a file named output_2.csv

In [120]:
def predict_prices(trained_model: Any, dataset_path: str) -> pd.DataFrame:
    """Predict prices for an evaluation dataset and write them to output_2.csv.

    Args:
        trained_model: Either a fitted model (any object with a ``predict``
            method) or the path of a model file saved with joblib.
        dataset_path: Path of the evaluation CSV file.

    Returns:
        A DataFrame with a single "Price" column holding the predictions, in
        the same order as the rows written to output_2.csv.
    """
    # Accept an already loaded model as well as a path to a saved one.
    # NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
    model = trained_model if hasattr(trained_model, "predict") else joblib.load(trained_model)

    # Encode the evaluation set with the genres collected from the training data
    # (module-level unique_genres); a copy is passed so that set is left untouched
    X_eval, _, _, _, _ = load_data(dataset_path, is_eval_dataset=True, known_genres=set(unique_genres))
    X_eval = pd.DataFrame(X_eval)

    # Make predictions
    y_pred = model.predict(X_eval)

    # Save the predictions to output_2.csv: a "Price" header followed by one prediction per line
    with open("output_2.csv", "w") as file:
        file.write("Price\n")
        file.writelines(str(pred) + "\n" for pred in y_pred)

    return pd.DataFrame({"Price": y_pred})
    
    
# Load the model from the location it was saved to after training, and predict
# from that same file (previously a different path, "trained_model.pkl", was passed)
trained_model = joblib.load("Output_CandidatX/trained_model.pkl")
predict_prices("Output_CandidatX/trained_model.pkl", "../dataset_eval.csv")
