# Movies

### Leitura do ficheiro json

In [1]:
import json
import pandas as pd
import csv

# Ler o arquivo JSON
with open('movies.json', 'r+') as file:
    data = json.load(file)

    # Percorrer as chaves do dicionário
    for movie_title, movie_data in data.items():
        # Criar uma nova entrada no dicionário com a coluna "label" e valor "None" como string
        movie = movie_data['data']
        if movie['movie'] != None and movie['movie'].get('label')==None: #Apenas coloca None se não tiver nenhum valor label
            movie['movie']['label'] = "None"

    # Voltar para o início do arquivo
    file.seek(0)

    # Escrever o arquivo JSON atualizado
    json.dump(data, file, indent=4)

    # Truncar o restante do arquivo, caso o novo conteúdo seja menor que o original
    file.truncate()

### Conversão de json em csv

In [2]:
with open('movies.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Type','CriticScore', 'Label/Studio', 'ReleaseYear', 'ReleaseMonth', 'Director', 'Genre', 'Rating', 'Runtime', 'NumOfCriticReviews'])
    
    for movie_title, movie_data in data.items():
        movie = movie_data['data']
        if((movie.get('movie'))):
            genre = "/".join([genre.strip() for genre in movie['movie']['genres']])
            director = "/".join([genre.strip() for genre in movie['movie']['director']])
            #Atribui a classificação consoante o CriticScore
            if movie['movie']['criticScore'] >= 90:
                score = "Excellent"
            elif movie['movie']['criticScore'] >= 70:
                score = "Good"
            elif movie['movie']['criticScore'] >= 40:
                score = "Mediocre"
            else:
                score = "Bad"
            date = movie['movie']['releaseDate'].split()
            month = date[0]
            writer.writerow([
                movie['movie']['title'],
                "Movie",
                score,
                movie['movie']['label'],
                movie['movie']['year'],
                month,
                director,
                genre,
                movie['movie']['rating'],
                movie['movie']['runtime'],
                movie['movie']['numOfCriticReviews']
            ])


In [3]:
pd.read_csv("movies.csv")

Unnamed: 0,Title,Type,CriticScore,Label/Studio,ReleaseYear,ReleaseMonth,Director,Genre,Rating,Runtime,NumOfCriticReviews
0,The Godfather,Movie,Bad,Paramount Pictures,1972,March,Francis Ford Coppola,Crime,R,175 min,16
1,The Shawshank Redemption,Movie,Good,,1994,September,Frank Darabont,Crime,R,142 min,21
2,The Dark Knight,Movie,Good,Warner Bros.,2008,July,Christopher Nolan,Crime,PG-13,152 min,39
3,The Lord of the Rings: The Fellowship of the Ring,Movie,Excellent,,2001,December,Peter Jackson,Fantasy,PG-13,178 min,34
4,Pulp Fiction,Movie,Excellent,Miramax Films,1994,October,Quentin Tarantino,Crime,R,154 min,24
...,...,...,...,...,...,...,...,...,...,...,...
105,The Spectacular Now,Movie,Good,A24,2013,August,James Ponsoldt,Romance,R,95 min,42
106,Safety Not Guaranteed,Movie,Good,FilmDistrict,2012,June,Colin Trevorrow,Comedy,R,86 min,31
107,Like Crazy,Movie,Mediocre,Paramount Vantage,2011,October,Drake Doremus,Romance,PG-13,90 min,38
108,The Art of Getting By,Movie,Bad,Fox Searchlight Pictures,2011,June,Gavin Wiesen,Romance,PG-13,84 min,28


# Tv-Shows

### Conversão de txt em json

In [4]:
import requests
import re
import json

# Ler o arquivo TXT
shows_file = open('tv-shows.txt', 'r')
shows_lines = shows_file.readlines()

shows = {}
elementos = []

for show_line in shows_lines:
    if (show_line[0] != '"'): #Linhas com a estrutura: Título (ano_inicio-ano_fim, network, genero, estudio, rating, adaptação), pontuação
        try:
            
            info, critic_score = show_line.rsplit(", ", 1)
            critic_score = critic_score.strip()
            info = info.split(", ")

            titulo, ano = info[0].split("(")
            network = info[1].strip()

            titulo = titulo.strip()
            ano = ano.strip(")")

            adaptation = "Original"
            if info[-1].strip().endswith("Adaptation)"):
                generos = [genero.strip() for genero in info[2:-3]]
                produtora = info[-3].strip()
                classificacao = info[-2].strip()
                if info[-1].strip().endswith("Book Adaptation)"):
                    adaptation = "Based on a book"
                if info[-1].strip().endswith("Comic Book Adaptation)"):
                    adaptation = "Based on a comic book"
                if info[-1].strip().endswith("Film Adaptation)"):
                    adaptation = "Based on a movie"
            else:
                generos = [genero.strip() for genero in info[2:-2]]
                produtora = info[-2].strip()
                classificacao = info[-1].strip()
                classificacao = classificacao.strip(")")
        except IndexError:
            continue
        nova_string = f'"{titulo}" ({ano}) - {network} - {produtora} - {", ".join(generos)} - {classificacao} - {adaptation} - {critic_score}'
        
    else: #Linhas com a estrutura: "Título" (ano) - estudio - genero - rating - adaptação - pontuação
        info = show_line.split(" - ")
        titulo_ano = info[0]
        produtora = info[1]
        genero = info[2]
        classificacao = info[3]
        adaptation = info[4].strip()
        if len(info)==6:
            critic_score = info[5]
        else:
            critic_score = 'None'
        if adaptation == "Not a book adaptation":
            adaptation = "Original"
        titulo, ano = titulo_ano[1:-1].split(" (")
        network = "None"

        nova_string = f'"{titulo}" ({ano}) - {network} - {produtora} - {genero} - {classificacao} - {adaptation} - {critic_score}'
    elementos.append(nova_string)

for show_line in elementos:
    lista = show_line.split(" - ")
    titulo_ano = lista[0]
    titulo, ano = titulo_ano.split(" (")
    titulo = titulo.replace('"', '')
    ano = ano.strip(")").split("-")
    try:
        # cria um dicionário com as informações obtidas
        show = {
            "title": titulo,
            "year": "-".join(ano),
            "network": lista[1],
            "studio": lista[2],
            "genre": lista[3],
            "rating": lista[4],
            "adaptation": lista[5],
            "critic_score": lista[6]
        }
        
        if titulo not in shows:
            # adiciona o dicionário ao dicionário principal
            shows[titulo] = {"data": {"tv-show": show}}
    except IndexError:
        continue

# Cria e escrever em arquivo JSON
with open('tv-shows.json', 'w') as output_file:
    output_file.write(json.dumps(shows))

### Conversão de json em csv

In [5]:
import json
import pandas as pd
import csv


# Ler o arquivo JSON
with open('tv-shows.json', 'r') as file:
    data = json.load(file)

# Cria e escreve em csv
with open('tv-shows.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Type', 'ReleaseYear', 'FinalYear', 'Network', 'Label/Studio', 'Genre', 'Rating', 'Adaptation', 'CriticScore'])

    for show_title, show_data in data.items():
        show = show_data['data']
        year = show['tv-show']['year'].split('-')
        final = 'None'
        score = show['tv-show']['critic_score']
        #Atribui a classificação consoante o score
        if score != 'None':
            score = float(score)
            if score >= 9:
                score = "Excellent"
            elif score >= 8:
                score = "Good"
            elif score > 5:
                score = "Mediocre"
            else:
                score = "Bad"
        if (len(year)>1):
            final = year[1]
        release = year[0]
        if((show.get('tv-show'))):
            writer.writerow([
                show['tv-show']['title'],
                "Tv-Show",
                release,
                final,
                show['tv-show']['network'],
                show['tv-show']['studio'],
                show['tv-show']['genre'],
                show['tv-show']['rating'],
                show['tv-show']['adaptation'],
                score
            ])

In [6]:
pd.read_csv("tv-shows.csv")

Unnamed: 0,Title,Type,ReleaseYear,FinalYear,Network,Label/Studio,Genre,Rating,Adaptation,CriticScore
0,Game of Thrones,Tv-Show,2011,2019,HBO,Warner Bros.,"Drama, Fantasy",TV-MA,Based on a book,Excellent
1,Breaking Bad,Tv-Show,2008,2013,AMC,Sony Pictures Television,Drama,TV-MA,Original,Excellent
2,Friends,Tv-Show,1994,2004,NBC,Warner Bros.,Comedy,TV-PG,Original,Good
3,The Sopranos,Tv-Show,1999,2007,HBO,Warner Bros.,Drama,TV-MA,Original,Excellent
4,Stranger Things,Tv-Show,2016,2022,Netflix,,"Horror, Science Fiction",,Based on a book,Good
...,...,...,...,...,...,...,...,...,...,...
237,Indebted,Tv-Show,2020,,,Sony Pictures Television,Comedy,TV-14,Original,Bad
238,The Baker and the Beauty,Tv-Show,2020,,,Universal Television,"Comedy, Drama",TV-14,Based on a TV show,Mediocre
239,Council of Dads,Tv-Show,2020,,,Jerry Bruckheimer Television,Drama,TV-14,Based on a book,Mediocre
240,The Secret of Skinwalker Ranch,Tv-Show,2020,,,Painless Productions,Reality-TV,TV-14,Original,Mediocre


# Albums

### Conversão de txt em json

In [7]:
import requests
import re
import json

# Ler o arquivo TXT
albums_file = open('albums.txt', 'r')
albums_lines = albums_file.readlines()

albums = {}

for album_line in albums_lines:
    elements = [e.strip() for e in album_line.split(" - ")]
    try:
        artist, genre = elements[1].split("(")[0].strip(), elements[1].split("(")[1].replace(")", "").strip()
        # cria um dicionário com as informações obtidas
        album = {
            "title": elements[0],
            "artist": artist,
            "genre": genre,
            "year": int(elements[2]),
            "label": elements[3],
            "sales": elements[4]
        }

        albums[elements[0]] = {"data": {"album": album}}

    except IndexError:
        continue

# Cria e escrever em arquivo JSON
with open('albums.json', 'w') as output_file:
    output_file.write(json.dumps(albums))

### Conversão de json em csv

In [8]:
import json
import pandas as pd
import csv

# Ler o arquivo JSON
with open('albums.json', 'r') as file:
    data = json.load(file)

# Converte as vendas em números
def convert_sales(sales):
    sales = sales.replace(',', '')
    if 'million' in sales:
        sales = float (sales.replace('million', ''))
        sales = int(sales * 1000000)
    else:
        sales = int(sales)
    return sales

# Cria e escreve em csv
with open('albums.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Type', 'Artist', 'Genre', 'ReleaseYear', 'Label/Studio', 'CriticScore'])
    for album_title, album_data in data.items():
        album = album_data['data']
        sales = convert_sales(album['album']['sales'])
        #Atribui a classificação consoante as vendas
        if sales >= 40000000:
            score = "Excellent"
        elif sales >= 10000000:
            score = "Good"
        elif sales >= 1000000:
            score = "Mediocre"
        else:
            score = "Bad"
        if((album.get('album'))):
            writer.writerow([
                album['album']['title'],
                "Album",
                album['album']['artist'],
                album['album']['genre'],
                album['album']['year'],
                album['album']['label'],
                score
            ])

In [9]:
pd.read_csv("albums.csv")

Unnamed: 0,Title,Type,Artist,Genre,ReleaseYear,Label/Studio,CriticScore
0,Thriller,Album,Michael Jackson,Pop/R&B,1982,Epic Records,Excellent
1,The Dark Side of the Moon,Album,Pink Floyd,Progressive Rock,1973,Harvest Records,Excellent
2,Back in Black,Album,AC/DC,Hard Rock,1980,Atlantic Records,Excellent
3,The Bodyguard Soundtrack,Album,Various Artists,Soundtrack/Pop,1992,Arista Records,Excellent
4,Bat Out of Hell,Album,Meat Loaf,Rock,1977,Epic Records,Excellent
...,...,...,...,...,...,...,...
96,You Forgot It in People,Album,Broken Social Scene,Indie Rock,2002,Arts & Crafts,Bad
97,Let It Come Down,Album,Spiritualized,Space Rock,2001,Arista Records,Bad
98,The Argument,Album,Fugazi,Post-Hardcore,2001,Dischord Records,Bad
99,Person Pitch,Album,Panda Bear,Experimental Pop,2007,Paw Tracks,Bad


# Merge

### Juntar os 3 datasets

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# carrega data sets
movies_df = pd.read_csv("movies.csv")
albums_df = pd.read_csv("albums.csv")
tv_shows_df = pd.read_csv("tv-shows.csv")

# merge dos data sets
merged_df1 = pd.merge(tv_shows_df, albums_df, on=["Title", "ReleaseYear", "Genre", "Type", "CriticScore", "Label/Studio"], how='outer')
merged_df2 = pd.merge(tv_shows_df, movies_df, on=["Title", "ReleaseYear", "Genre", "Type", "CriticScore", "Label/Studio", "Rating"], how='outer')
merged_df3 = pd.merge(albums_df, movies_df, on=["Title", "ReleaseYear", "Genre", "Type", "CriticScore", "Label/Studio"], how='outer')
merged_df = pd.concat([merged_df1, merged_df2, merged_df3], axis=0)


# Reorganiza as colunas (coloca CriticScore no fim)
columns = merged_df.columns.tolist()
columns.remove("CriticScore")
columns.append("CriticScore")
merged_df = merged_df[columns]

# escreve os data sets num ficheiro csv
merged_df.fillna("None", inplace=True)
merged_df.to_csv("merged_data.csv", index=False)


In [11]:
df = pd.read_csv("merged_data.csv")
df

Unnamed: 0,Title,Type,ReleaseYear,FinalYear,Network,Label/Studio,Genre,Rating,Adaptation,Artist,ReleaseMonth,Director,Runtime,NumOfCriticReviews,CriticScore
0,Game of Thrones,Tv-Show,2011,2019,HBO,Warner Bros.,"Drama, Fantasy",TV-MA,Based on a book,,,,,,Excellent
1,Breaking Bad,Tv-Show,2008,2013,AMC,Sony Pictures Television,Drama,TV-MA,Original,,,,,,Excellent
2,Friends,Tv-Show,1994,2004,NBC,Warner Bros.,Comedy,TV-PG,Original,,,,,,Good
3,The Sopranos,Tv-Show,1999,2007,HBO,Warner Bros.,Drama,TV-MA,Original,,,,,,Excellent
4,Stranger Things,Tv-Show,2016,2022,Netflix,,"Horror, Science Fiction",,Based on a book,,,,,,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,The Spectacular Now,Movie,2013,,,A24,Romance,R,,,August,James Ponsoldt,95 min,42.0,Good
902,Safety Not Guaranteed,Movie,2012,,,FilmDistrict,Comedy,R,,,June,Colin Trevorrow,86 min,31.0,Good
903,Like Crazy,Movie,2011,,,Paramount Vantage,Romance,PG-13,,,October,Drake Doremus,90 min,38.0,Mediocre
904,The Art of Getting By,Movie,2011,,,Fox Searchlight Pictures,Romance,PG-13,,,June,Gavin Wiesen,84 min,28.0,Bad
