In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'ctrlshiftintelligence-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F74740%2F8162824%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240412%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240412T080452Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D92536aabb03280e48c0e6fc53cf048a479ef03e8dcb040201cf137053e90dad5c63333ac2c8d314072af3e0727a12f342f07084d32608f58623f447571cc417cdec6f6b3160e46af32eddf2ead420bec2238ce80360285cc8c4797bf8874ee01afebce575e8edce777d8f404cd73428b7825c61906992de44ddc7574d72bcd3014e3d2c9e3a99cdbc14e9c177e79d5fca1ebd32ef923754c29010a82480827f155306342eda0cec085b8ee3d947edc6bc2a8100c9eaf8de6637c4d902b8f9e0f2aacb4c125a973249000e859f7f1e5311ef5c17fb9dd53072b2c83a5d3bd0c22d058f1ae89c427a2f911b809276e9caedbc5c196922dacc645c231a1e6c00fc9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading ctrlshiftintelligence-2024, 5391248 bytes compressed
Downloaded and uncompressed: ctrlshiftintelligence-2024
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ctrlshiftintelligence-2024/train.csv
/kaggle/input/ctrlshiftintelligence-2024/test.csv


In [3]:
import pandas as pd
import io
import numpy as np

In [4]:
df = pd.read_csv('/kaggle/input/ctrlshiftintelligence-2024/train.csv')
df

Unnamed: 0.1,Unnamed: 0,track_id,track_name,track_artist,lyrics,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,track_mode,speech_factor,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo,duration_track_ms,track_language,target
0,0,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,1,0.0442,0.01170,0.009940,0.3470,0.404,135.225,373512,en,28
1,1,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,...,0,0.2160,0.00432,0.007230,0.4890,0.650,111.904,262467,en,0
2,2,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,...,0,0.0341,0.68900,0.000000,0.0664,0.405,118.593,243067,en,0
3,3,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,...,1,0.0409,0.03700,0.000000,0.1380,0.240,130.018,193160,en,65
4,4,00f9VGHfQhAHMCQ2bSjg3D,Soldier,James TW,"Hold your breath, don't look down, keep trying...",3GNzXsFbzdwM0WKCZtgeNP,Chapters,2019-04-26,urban contemporary,4WiB26kw0INKwbzfb5M6Tv,...,1,0.0550,0.28000,0.000000,0.0975,0.305,147.764,224720,en,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5995,31tcr4OcLi2TXE5u4Iwhjc,Right Down to It,Damian Dame,"Mmm..... Baby, if I drive you crazy Ain't noth...",2eSIbUfDDE6JV8bpFgsb5M,Damian Dame,1991,Kenny B's Ultimate New Jack Swing Mix,7bdhfXlbfml9CuNn5c7372,...,0,0.0281,0.10400,0.000000,0.0755,0.818,91.914,294267,en,18
5996,5996,31Tp5iKrVUayn65qhg06nE,She,Elvis Costello,She may be the face I can't forget A trace of ...,2VNMoSXYKrjAAU4aqiTvtY,In Motion Pictures,2012,The Sound of Permanent Wave,4EYSGTuqe9cVfSVpX4gtGv,...,1,0.0320,0.42700,0.000002,0.1300,0.276,130.876,187720,en,64
5997,5997,31u6rUeIEXGrYVoh10U7eu,Leave It In My Dreams,The Voidz,"Darling, darlin', I'm falling, oh I'm your man...",2dMmcXlG8xtRJNlsjIrPWe,Virtue,2018-03-30,permanent wave,3uFyGoayrP71xS6T6Y8Bh2,...,0,0.0355,0.02260,0.011900,0.1090,0.614,118.551,239442,en,58
5998,5998,31ue6kAOfrqVrdoPdfkq40,Adaptation,The Weeknd,"When the sun comes up, you're searching for a ...",2FgMWuwMeTgJArP2RF3upF,Kiss Land,2013-01-01,Urban Contemporary,4Pbs84EQbuAblxlp6Chz0d,...,0,0.0681,0.15100,0.000583,0.3050,0.613,114.960,283933,en,44


In [5]:
import re
df['lyrics'] = df['lyrics'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

print(df)

In [6]:
import re
df['track_name'] = df['track_name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

print(df)

In [7]:
import re
df['track_artist'] = df['track_artist'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

print(df)

In [8]:
import re
df['track_album_name'] = df['track_album_name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

print(df)

In [9]:
import re
df['playlist_name'] = df['playlist_name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

print(df)

In [10]:
df

Unnamed: 0.1,Unnamed: 0,track_id,track_name,track_artist,lyrics,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,track_mode,speech_factor,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo,duration_track_ms,track_language,target
0,0,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,1,0.0442,0.01170,0.009940,0.3470,0.404,135.225,373512,en,28
1,1,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,...,0,0.2160,0.00432,0.007230,0.4890,0.650,111.904,262467,en,0
2,2,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,...,0,0.0341,0.68900,0.000000,0.0664,0.405,118.593,243067,en,0
3,3,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,...,1,0.0409,0.03700,0.000000,0.1380,0.240,130.018,193160,en,65
4,4,00f9VGHfQhAHMCQ2bSjg3D,Soldier,James TW,"Hold your breath, don't look down, keep trying...",3GNzXsFbzdwM0WKCZtgeNP,Chapters,2019-04-26,urban contemporary,4WiB26kw0INKwbzfb5M6Tv,...,1,0.0550,0.28000,0.000000,0.0975,0.305,147.764,224720,en,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5995,31tcr4OcLi2TXE5u4Iwhjc,Right Down to It,Damian Dame,"Mmm..... Baby, if I drive you crazy Ain't noth...",2eSIbUfDDE6JV8bpFgsb5M,Damian Dame,1991,Kenny B's Ultimate New Jack Swing Mix,7bdhfXlbfml9CuNn5c7372,...,0,0.0281,0.10400,0.000000,0.0755,0.818,91.914,294267,en,18
5996,5996,31Tp5iKrVUayn65qhg06nE,She,Elvis Costello,She may be the face I can't forget A trace of ...,2VNMoSXYKrjAAU4aqiTvtY,In Motion Pictures,2012,The Sound of Permanent Wave,4EYSGTuqe9cVfSVpX4gtGv,...,1,0.0320,0.42700,0.000002,0.1300,0.276,130.876,187720,en,64
5997,5997,31u6rUeIEXGrYVoh10U7eu,Leave It In My Dreams,The Voidz,"Darling, darlin', I'm falling, oh I'm your man...",2dMmcXlG8xtRJNlsjIrPWe,Virtue,2018-03-30,permanent wave,3uFyGoayrP71xS6T6Y8Bh2,...,0,0.0355,0.02260,0.011900,0.1090,0.614,118.551,239442,en,58
5998,5998,31ue6kAOfrqVrdoPdfkq40,Adaptation,The Weeknd,"When the sun comes up, you're searching for a ...",2FgMWuwMeTgJArP2RF3upF,Kiss Land,2013-01-01,Urban Contemporary,4Pbs84EQbuAblxlp6Chz0d,...,0,0.0681,0.15100,0.000583,0.3050,0.613,114.960,283933,en,44


In [11]:
y = df['target']
y

0       28
1        0
2        0
3       65
4       70
        ..
5995    18
5996    64
5997    58
5998    44
5999    56
Name: target, Length: 6000, dtype: int64

In [12]:
columns_to_drop = df.columns[[0, 1, 5,  7,9,19,24,25]]
X = df.drop(columns_to_drop, axis=1)
X

Unnamed: 0,track_name,track_artist,lyrics,track_album_name,playlist_name,playlist_genre,playlist_subgenre,dance_factor,track_energy,track_key,track_loudness,track_mode,speech_factor,track_acousticness,track_liveness,track_valence,track_tempo,duration_track_ms
0,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",Love & Loss,Hard Rock Workout,rock,hard rock,0.303,0.880,9,-4.739,1,0.0442,0.01170,0.3470,0.404,135.225,373512
1,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",Gold,"Back in the day - R&B, New Jack Swing, Swingbe...",r&b,new jack swing,0.845,0.652,6,-7.504,0,0.2160,0.00432,0.4890,0.650,111.904,262467
2,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,CeeLo's Magic Moment,Christmas Soul,r&b,neo soul,0.425,0.378,5,-5.819,0,0.0341,0.68900,0.0664,0.405,118.593,243067
3,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,KARD 2nd Digital Single ‘Dumb Litty’,K-Party Dance Mix,pop,dance pop,0.760,0.887,9,-1.993,1,0.0409,0.03700,0.1380,0.240,130.018,193160
4,Soldier,James TW,"Hold your breath, don't look down, keep trying...",Chapters,urban contemporary,r&b,urban contemporary,0.496,0.639,6,-6.157,1,0.0550,0.28000,0.0975,0.305,147.764,224720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,Right Down to It,Damian Dame,"Mmm..... Baby, if I drive you crazy Ain't noth...",Damian Dame,Kenny B's Ultimate New Jack Swing Mix,r&b,new jack swing,0.743,0.584,1,-11.274,0,0.0281,0.10400,0.0755,0.818,91.914,294267
5996,She,Elvis Costello,She may be the face I can't forget A trace of ...,In Motion Pictures,The Sound of Permanent Wave,rock,permanent wave,0.333,0.280,1,-12.870,1,0.0320,0.42700,0.1300,0.276,130.876,187720
5997,Leave It In My Dreams,The Voidz,"Darling, darlin', I'm falling, oh I'm your man...",Virtue,permanent wave,rock,permanent wave,0.600,0.746,4,-4.696,0,0.0355,0.02260,0.1090,0.614,118.551,239442
5998,Adaptation,The Weeknd,"When the sun comes up, you're searching for a ...",Kiss Land,Urban Contemporary,r&b,urban contemporary,0.642,0.790,11,-6.800,0,0.0681,0.15100,0.3050,0.613,114.960,283933


In [None]:
pip install vaderSentiment


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()

In [None]:
def analyze_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores

In [None]:
df['sentiment'] = df['lyrics'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df['neg'] = df['sentiment'].apply(lambda x: x['neg'])
df['neu'] = df['sentiment'].apply(lambda x: x['neu'])
df['pos'] = df['sentiment'].apply(lambda x: x['pos'])
df['tot'] = df['sentiment'].apply(lambda x:x['compound'])
# Drop the original 'sentiment_scores' column
df.drop('sentiment', axis=1, inplace=True)

In [None]:
df['sentiment'] = df['track_name'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df['neg1'] = df['sentiment'].apply(lambda x: x['neg'])
df['neu1'] = df['sentiment'].apply(lambda x: x['neu'])
df['pos1'] = df['sentiment'].apply(lambda x: x['pos'])
df['tot1'] = df['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df.drop('sentiment', axis=1, inplace=True)

In [None]:
df['sentiment'] = df['track_album_name'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df['neg2'] = df['sentiment'].apply(lambda x: x['neg'])
df['neu2'] = df['sentiment'].apply(lambda x: x['neu'])
df['pos2'] = df['sentiment'].apply(lambda x: x['pos'])
df['tot2'] = df['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df.drop('sentiment', axis=1, inplace=True)

In [None]:
df['sentiment'] = df['playlist_name'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df['neg3'] = df['sentiment'].apply(lambda x: x['neg'])
df['neu3'] = df['sentiment'].apply(lambda x: x['neu'])
df['pos3'] = df['sentiment'].apply(lambda x: x['pos'])
df['tot3'] = df['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df.drop('sentiment', axis=1, inplace=True)

In [None]:
df['sentiment'] = df['track_artist'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df['neg4'] = df['sentiment'].apply(lambda x: x['neg'])
df['neu4'] = df['sentiment'].apply(lambda x: x['neu'])
df['pos4'] = df['sentiment'].apply(lambda x: x['pos'])
df['tot4'] = df['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df.drop('sentiment', axis=1, inplace=True)

In [None]:
columns_to_drop = df.columns[[0, 1,2,3,4, 5,6, 8, 7,9,19,24,25]]
X = df.drop(columns_to_drop, axis=1)
X

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
one_hot_encoded_array = encoder.fit_transform(df[['playlist_subgenre', 'playlist_genre']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['playlist_subgenre', 'playlist_genre']))
X = pd.concat([X, one_hot_encoded_df], axis=1)
X = X.drop(['playlist_subgenre', 'playlist_genre'], axis = 1)
print(X)
column_names = one_hot_encoded_df.columns
print(column_names)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [None]:
# X_train = X
# y_train = y

In [None]:
gb = GradientBoostingRegressor(random_state = 0)

In [None]:
gb.fit(X_train, y_train)

In [None]:
y_gb_pred = gb.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(y_test, y_gb_pred)

679.625664920381 - best MSE (used this model)



In [None]:
df_test = pd.read_csv('/kaggle/input/ctrlshiftintelligence-2024/test.csv')
df_test

In [None]:
df_test['sentiment'] = df_test['lyrics'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df_test['neg'] = df_test['sentiment'].apply(lambda x: x['neg'])
df_test['neu'] = df_test['sentiment'].apply(lambda x: x['neu'])
df_test['pos'] = df_test['sentiment'].apply(lambda x: x['pos'])
df_test['tot'] = df_test['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df_test.drop('sentiment', axis=1, inplace=True)

In [None]:
df_test['sentiment'] = df_test['track_name'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df_test['neg1'] = df_test['sentiment'].apply(lambda x: x['neg'])
df_test['neu1'] = df_test['sentiment'].apply(lambda x: x['neu'])
df_test['pos1'] = df_test['sentiment'].apply(lambda x: x['pos'])
df_test['tot1'] = df_test['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df_test.drop('sentiment', axis=1, inplace=True)

In [None]:
df_test['sentiment'] = df_test['track_album_name'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df_test['neg2'] = df_test['sentiment'].apply(lambda x: x['neg'])
df_test['neu2'] = df_test['sentiment'].apply(lambda x: x['neu'])
df_test['pos2'] = df_test['sentiment'].apply(lambda x: x['pos'])
df_test['tot2'] = df_test['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df_test.drop('sentiment', axis=1, inplace=True)

In [None]:
df_test['sentiment'] = df_test['playlist_name'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df_test['neg3'] = df_test['sentiment'].apply(lambda x: x['neg'])
df_test['neu3'] = df_test['sentiment'].apply(lambda x: x['neu'])
df_test['pos3'] = df_test['sentiment'].apply(lambda x: x['pos'])
df_test['tot3'] = df_test['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df_test.drop('sentiment', axis=1, inplace=True)

In [None]:
df_test['sentiment'] = df_test['track_artist'].apply(analyze_sentiment)

In [None]:
# Separate 'neg', 'neu', and 'pos' values from each cell in 'sentiment_scores' column
df_test['neg4'] = df_test['sentiment'].apply(lambda x: x['neg'])
df_test['neu4'] = df_test['sentiment'].apply(lambda x: x['neu'])
df_test['pos4'] = df_test['sentiment'].apply(lambda x: x['pos'])
df_test['tot4'] = df_test['sentiment'].apply(lambda x:x['compound'])

# Drop the original 'sentiment_scores' column
df_test.drop('sentiment', axis=1, inplace=True)

In [None]:
columns_to_drop = df_test.columns[[0, 1,2,3,4, 5,6 , 7,8,9,19,24]]
X_test = df_test.drop(columns_to_drop, axis=1)
X_test

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
one_hot_encoded_array = encoder.fit_transform(df_test[['playlist_subgenre', 'playlist_genre']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['playlist_subgenre', 'playlist_genre']))
X_test = pd.concat([X_test, one_hot_encoded_df], axis=1)
X_test = X_test.drop(['playlist_subgenre', 'playlist_genre'], axis = 1)
print(X_test)
column_names = one_hot_encoded_df.columns
print(column_names)


In [None]:
y_gb_test_pred = gb.predict(X_test)

In [None]:
finaldf2 = pd.DataFrame(y_gb_test_pred, columns = ['target'])
finaldf2

In [None]:
finaldf2.to_csv("Submission2.csv")