# Merge CSVs

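This notebook merges all CSV files in the `BASEDIR` folder into a single CSV file (`OUTFILE`). Along the way, it splits each album's `release-date` into separate `year`, `month`, and `day` columns.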

## Imports

In [3]:
import pandas as pd
import os

## Functions

In [4]:
def get_year(string):
    """Return the year component of a ``YYYY[-MM[-DD]]`` date string.

    Args:
        string: The release date as a string. Any non-string value
            (e.g. ``NaN`` or ``None`` for a missing date) is treated as unknown.

    Returns:
        The text before the first ``'-'``, or the sentinel ``'9999'`` when
        the value is not a string or the year component is empty.
    """
    if not isinstance(string, str):
        return '9999'
    year = string.split('-')[0]
    # An empty leading component (e.g. '' or '-08-27') carries no year.
    return year if year else '9999'

def get_month(string):
    """Return the month component of a ``YYYY[-MM[-DD]]`` date string.

    Args:
        string: The release date as a string. Any non-string value
            (e.g. ``NaN`` or ``None`` for a missing date) is treated as unknown.

    Returns:
        The second ``'-'``-separated component, or the sentinel ``'99'`` when
        the value is not a string or the month component is absent or empty.
    """
    if not isinstance(string, str):
        return '99'
    parts = string.split('-')
    # A missing or empty second component (e.g. '2001' or '2001-') means
    # the month is unknown.
    if len(parts) >= 2 and parts[1]:
        return parts[1]
    return '99'

def get_day(string):
    """Return the day component of a ``YYYY[-MM[-DD]]`` date string.

    Args:
        string: The release date as a string. Any non-string value
            (e.g. ``NaN`` or ``None`` for a missing date) is treated as unknown.

    Returns:
        The third ``'-'``-separated component, or the sentinel ``'99'`` when
        the value is not a string or the day component is absent or empty.
    """
    if not isinstance(string, str):
        return '99'
    parts = string.split('-')
    # A missing or empty third component (e.g. '2001-08' or '2001-08-')
    # means the day is unknown.
    if len(parts) >= 3 and parts[2]:
        return parts[2]
    return '99'


## Constants

In [12]:
BASEDIR = 'data/processed' # Directory whose files are read and merged (relative to the working directory)
OUTFILE = 'data/proc_Albums.csv' # Path the merged DataFrame is written to as CSV

## Merging

In [17]:
# Collect only CSV files, in a deterministic order. os.listdir() also returns
# non-CSV entries (e.g. .ipynb_checkpoints, .DS_Store) that pd.read_csv would
# choke on, and the order it returns entries in is arbitrary.
csv_files = sorted(f for f in os.listdir(BASEDIR) if f.lower().endswith('.csv'))
if not csv_files:
    raise ValueError('No CSV files found in {!r}'.format(BASEDIR))

dfs = [pd.read_csv(os.path.join(BASEDIR, f)) for f in csv_files]

result_df = pd.concat(dfs)

# Renaming of some columns
result_df = result_df.rename(columns={'name': 'lastfm_album', 'artist': 'lastfm_artist'})

# Setting index, sorting, dropping unnecessary columns
result_df = result_df.drop(['Unnamed: 0'], axis=1)
result_df = result_df.set_index(['Band', 'Album']).sort_values('playcount', ascending=False).drop(['0', 'image'], axis=1)

# Process release dates: split 'release-date' into year / month / day columns.
# Series.map passes missing values (NaN) through to the helpers, which turn
# them into the '9999' / '99' sentinels.
release_dates = result_df['release-date']
result_df['year'] = release_dates.map(get_year)
result_df['month'] = release_dates.map(get_month)
result_df['day'] = release_dates.map(get_day)
result_df = result_df.drop(['release-date'], axis=1)

# Preview DataFrame
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Nr.,Jahr,Monat,Tag,MA,lastfm_artist,ignored tags,listeners,mbid,lastfm_album,playcount,tags,url,year,month,day
Band,Album,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
System of a Down,Toxicity,7936,2001,9.0,4,,System of a Down,"['albums i own', 'metal', 'rock']",2384094.0,b8e92589-8b7c-4d0a-9986-02d129997e04,Toxicity,64877945.0,"['alternative metal', 'nu metal']",https://www.last.fm/music/System+of+a+Down/Tox...,2001,08,27
Linkin Park,Meteora,4976,2003,3.0,25,,Linkin Park,"['albums i own', 'rock', 'linkin park', 'alter...",2162873.0,f3bfd870-0708-46d0-9953-9f5f573fb600,Meteora,57517769.0,['nu metal'],https://www.last.fm/music/Linkin+Park/Meteora,2003,03,24
Linkin Park,Hybrid Theory,4975,2000,10.0,24,,Linkin Park,"['albums i own', 'rock', 'linkin park', 'alter...",1915887.0,f0cd4041-f859-4b97-b563-3b5f33f98d9d,Hybrid Theory,51008344.0,['nu metal'],https://www.last.fm/music/Linkin+Park/Hybrid+T...,2000,05,07
Linkin Park,Minutes to Midnight,4977,2007,5.0,14,,Linkin Park,"['albums i own', 'rock', 'alternative rock', '...",1638960.0,d08a98b7-0c33-40d6-b574-ff0ce1600af7,Minutes to Midnight,46298990.0,['nu metal'],https://www.last.fm/music/Linkin+Park/Minutes+...,2007,05,11
Nirvana,Nevermind,5942,1991,9.0,24,,Nirvana,"['albums i own', 'rock', '90s', 'alternative']",2440409.0,f7930de3-081a-452a-bafe-f7285a2abb6d,Nevermind,44981006.0,['grunge'],https://www.last.fm/music/Nirvana/Nevermind,1991,09,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ensiferum,Thalassic,3081,,,,6.67,,,,,,,,,9999,99,99
Ensiferum,Two Paths,3082,,,,3.65,,,,,,,,,9999,99,99
Enslaved,E,3088,,,,,,,,,,,,,9999,99,99
Enslaved,In Times,3092,,,,10.73,,,,,,,,,9999,99,99


In [24]:
# Keep only the columns destined for the output file, in their final order.
result_df = result_df.loc[:, [
    'lastfm_artist', 'lastfm_album',
    'Jahr', 'Monat', 'Tag',
    'year', 'month', 'day',
    'listeners', 'playcount',
    'tags', 'ignored tags',
    'mbid', 'url',
]]

In [25]:
# Write the merged frame to disk; the (Band, Album) index is saved as the leading columns.
result_df.to_csv(path_or_buf=OUTFILE)