In [47]:
import pandas as pd
import os
import json
import re
import requests
from pathlib import Path

from tqdm import tqdm
# import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType

from pyspark.ml import Pipeline

In [68]:
# Optional one-time setup: clone the project repository and work from inside it.
# !git clone https://github.com/meralegre/Big_Data_IMDb.git
# %cd Big_Data_IMDb/
# %pwd

### Load data with Spark

In [48]:
# Create (or reuse) the notebook-wide Spark session: local master, with the
# driver bound to the loopback address.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

# spark = SparkSession.builder.appName("BERT-FineTuning").getOrCreate()

In [53]:
def load_train_data():
    """Load every ``train-*.csv`` part under ``data/train/`` into one Spark DataFrame.

    The parts are read with pandas in sorted filename order, concatenated once,
    stripped of the ``"Unnamed: 0"`` column when present, and handed to the
    module-level ``spark`` session. The literal string ``\\N`` is then replaced
    by null on the Spark side.

    NOTE(review): only the ``\\N`` marker is mapped to null here; the output
    displayed further down in this notebook still shows ``NaN`` for values that
    pandas read as missing.

    Returns:
        The Spark DataFrame produced by ``spark.createDataFrame(...)`` after the
        ``\\N`` replacement.

    Raises:
        FileNotFoundError: if ``data/train/`` contains no ``train-*.csv`` file
            (or the directory itself is missing, via ``os.listdir``).
    """
    path = "data/train/"

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order reproducible from one run (or machine) to the next.
    train_files = sorted(
        file
        for file in os.listdir(path=path)
        if file.startswith('train-') and file.endswith('.csv')
    )
    if not train_files:
        raise FileNotFoundError(f"No 'train-*.csv' files found in {path!r}")

    # Read all parts first and concatenate once: growing a DataFrame with
    # pd.concat inside the loop re-copies the accumulated rows every iteration.
    frames = [pd.read_csv(os.path.join(path, file)) for file in train_files]
    train_df = pd.concat(frames, ignore_index=False)

    # "Unnamed: 0" is the name pandas assigns to a header-less first column;
    # errors="ignore" keeps the loader working for parts saved without it.
    train_df = train_df.drop(columns="Unnamed: 0", errors="ignore")

    spark_train_df = spark.createDataFrame(train_df).replace(to_replace='\\N', value=None)
    return spark_train_df

In [52]:
def load_validation_data():
    """Load ``data/validation_hidden.csv`` into a Spark DataFrame.

    The CSV is read with pandas using its first column as the index, any
    remaining ``"Unnamed: 0"`` column is dropped, and the frame is handed to
    the module-level ``spark`` session. The literal string ``\\N`` is then
    replaced by null on the Spark side.

    Returns:
        The Spark DataFrame produced by ``spark.createDataFrame(...)`` after the
        ``\\N`` replacement.
    """
    path = "data/"
    validation_df = pd.read_csv(os.path.join(path, "validation_hidden.csv"), index_col=[0])

    # With index_col=[0] a header-less first column has already become the
    # index, so "Unnamed: 0" may not exist as a column; errors="ignore" drops
    # it when present and is a no-op otherwise (instead of raising KeyError).
    validation_df = validation_df.drop(columns="Unnamed: 0", errors="ignore")

    spark_validation_df = spark.createDataFrame(validation_df).replace(to_replace='\\N', value=None)
    return spark_validation_df

def load_test_data():
    """Load ``data/test_hidden.csv`` into a Spark DataFrame.

    The CSV is read with pandas using its first column as the index, any
    remaining ``"Unnamed: 0"`` column is dropped, and the frame is handed to
    the module-level ``spark`` session. The literal string ``\\N`` is then
    replaced by null on the Spark side.

    Returns:
        The Spark DataFrame produced by ``spark.createDataFrame(...)`` after the
        ``\\N`` replacement.
    """
    path = "data/"
    test_df = pd.read_csv(os.path.join(path, "test_hidden.csv"), index_col=[0])

    # With index_col=[0] a header-less first column has already become the
    # index, so "Unnamed: 0" may not exist as a column; errors="ignore" drops
    # it when present and is a no-op otherwise (instead of raising KeyError).
    test_df = test_df.drop(columns="Unnamed: 0", errors="ignore")

    spark_test_df = spark.createDataFrame(test_df).replace(to_replace='\\N', value=None)
    return spark_test_df


In [69]:
# Materialise the training set and preview it; the arguments below are the
# defaults of show(), spelled out: first 20 rows, long strings truncated.
train_data = load_train_data()
train_data.show(n=20, truncate=True)

+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|   tconst|        primaryTitle|       originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|tt0011439|   The Mark of Zorro|   The Mark of Zorro|     1920|   NULL|            79|  2439.0| true|
|tt0012532|Ớrpháns ớf thé Stớrm|                 NaN|     1921|   NULL|           150|     NaN| true|
|tt0013933|  The Faithful Heart|        Coeur fidèle|     1923|   NULL|            87|  1252.0| true|
|tt0015400| The Thief of Bagdad|                 NaN|     1924|   NULL|           155|  6001.0| true|
|tt0015842|  The Joyless Street|                 NaN|     1925|   NULL|           125|  1554.0| true|
|tt0016544|    The Wizard of Oz|                 NaN|     1925|   NULL|            95|  1497.0|false|
|tt0016641|Ben-Hur: A Tale o...|Ben-Hur: A Tale o...|     1925|   NULL|           