<a href="https://colab.research.google.com/github/pawara101/pyspark-trials/blob/main/pyspark_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import lit

In [46]:
spark = SparkSession.builder.appName("test").getOrCreate()

In [47]:
train_df = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/pyspark/tcc_ceds_music.csv', header=True,)

In [48]:
train_df.show()

+---+--------------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+----------+------------------+
|_c0|         artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|    family/spiritual|          like/girls|             sadness|       

In [49]:
train_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- lyrics: string (nullable = true)
 |-- len: string (nullable = true)
 |-- dating: string (nullable = true)
 |-- violence: string (nullable = true)
 |-- world/life: string (nullable = true)
 |-- night/time: string (nullable = true)
 |-- shake the audience: string (nullable = true)
 |-- family/gospel: string (nullable = true)
 |-- romantic: string (nullable = true)
 |-- communication: string (nullable = true)
 |-- obscene: string (nullable = true)
 |-- music: string (nullable = true)
 |-- movement/places: string (nullable = true)
 |-- light/visual perceptions: string (nullable = true)
 |-- family/spiritual: string (nullable = true)
 |-- like/girls: string (nullable = true)
 |-- sadness: string (nullable = true)
 |-- feelings: string (nullable = true)
 |-- danceability: string (null

In [50]:
dataframes = train_df.randomSplit([0.8, 0.2], seed=26)

In [51]:
train = dataframes[0]
test = dataframes[1]

In [52]:
dataframes[0].count()

22654

In [53]:
dataframes[0].show()

+-----+-----------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+
|  _c0|      artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|    family/spiritual|          like/girls|             sadness|        

In [54]:
train_df.select('genre').distinct().collect()

[Row(genre='pop'),
 Row(genre='blues'),
 Row(genre='country'),
 Row(genre='jazz'),
 Row(genre='hip hop'),
 Row(genre='rock'),
 Row(genre='reggae')]

In [55]:
train_df.schema.names

['_c0',
 'artist_name',
 'track_name',
 'release_date',
 'genre',
 'lyrics',
 'len',
 'dating',
 'violence',
 'world/life',
 'night/time',
 'shake the audience',
 'family/gospel',
 'romantic',
 'communication',
 'obscene',
 'music',
 'movement/places',
 'light/visual perceptions',
 'family/spiritual',
 'like/girls',
 'sadness',
 'feelings',
 'danceability',
 'loudness',
 'acousticness',
 'instrumentalness',
 'valence',
 'energy',
 'topic',
 'age']

In [56]:
train_df.dtypes

[('_c0', 'string'),
 ('artist_name', 'string'),
 ('track_name', 'string'),
 ('release_date', 'string'),
 ('genre', 'string'),
 ('lyrics', 'string'),
 ('len', 'string'),
 ('dating', 'string'),
 ('violence', 'string'),
 ('world/life', 'string'),
 ('night/time', 'string'),
 ('shake the audience', 'string'),
 ('family/gospel', 'string'),
 ('romantic', 'string'),
 ('communication', 'string'),
 ('obscene', 'string'),
 ('music', 'string'),
 ('movement/places', 'string'),
 ('light/visual perceptions', 'string'),
 ('family/spiritual', 'string'),
 ('like/girls', 'string'),
 ('sadness', 'string'),
 ('feelings', 'string'),
 ('danceability', 'string'),
 ('loudness', 'string'),
 ('acousticness', 'string'),
 ('instrumentalness', 'string'),
 ('valence', 'string'),
 ('energy', 'string'),
 ('topic', 'string'),
 ('age', 'string')]

In [57]:
## Select Feature Columns
feature_cols = ['artist_name',
 'track_name',
 'release_date',
 'genre',
 'lyrics',
 'len',
 'dating',
 'violence',
 'world/life',
 'night/time',
 'shake the audience',
 'family/gospel',
 'romantic',
 'communication',
 'obscene',
 'music',
 'movement/places',
 'light/visual perceptions',
 'family/spiritual',
 'like/girls',
 'sadness',
 'feelings',
 'danceability',
 'loudness',
 'acousticness',
 'instrumentalness',
 'valence',
 'energy',
 'topic',
 'age']

In [58]:
## Create Label Column
train=train.withColumn("label",lit(0))

In [60]:
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
tokenizer_1 = RegexTokenizer(inputCol="lyrics", outputCol="lyrics_words",pattern="\\W")

lyric_words = tokenizer_1.transform(train)
lyric_words.show(10)

+-----+---------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+--------------------+
|  _c0|    artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|    family/spiritual|          like/girls|      

In [62]:
tokenizer_2 = RegexTokenizer(inputCol="track_name", outputCol="track_words",pattern="\\W")

track_words = tokenizer_2.transform(lyric_words)
track_words.show(10)

+-----+---------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+--------------------+--------------------+
|  _c0|    artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|    family/spiritual|      

In [63]:
swr1 = StopWordsRemover(inputCol = 'lyrics_words', outputCol = 'lyrics_words_sw_removed')
track_swr_1 = swr1.transform(track_words)
track_swr_1.show(10)

+-----+---------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+--------------------+--------------------+-----------------------+
|  _c0|    artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|   

In [65]:
swr2 = StopWordsRemover(inputCol = 'track_words', outputCol = 'track_words_sw_removed')
track_swr_2 = swr2.transform(track_swr_1)
track_swr_2.show(10)

+-----+---------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+--------------------+--------------------+-----------------------+----------------------+
|  _c0|    artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light

In [76]:
from pyspark.ml.feature import Word2Vec

word2vec = Word2Vec(vectorSize = 10, inputCol = 'lyrics_words_sw_removed', outputCol = 'result_lyrics')
model1 = word2vec.fit(track_swr_2)
result1 = model1.transform(track_swr_2)

result1.show(10)

+-----+---------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+--------------------+--------------------+-----------------------+----------------------+--------------------+
|  _c0|    artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     

In [None]:
result1.select('result_lyrics').distinct().collect()

In [80]:
word2vec2 = Word2Vec(vectorSize = 10, inputCol = 'track_words_sw_removed', outputCol = 'result_track')
model2 = word2vec2.fit(result1)
result2 = model2.transform(result1)

result2.show(10)

+-----+---------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+--------------------+--------------------+-----------------------+----------------------+--------------------+--------------------+
|  _c0|    artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|     

In [99]:
# Fit the pipeline to training dataset.
finalised_data = result2.select('result_lyrics','result_track','len','age','genre','label')

In [100]:
finalised_data.show()

+--------------------+--------------------+---+------------------+-----+-----+
|       result_lyrics|        result_track|len|               age|genre|label|
+--------------------+--------------------+---+------------------+-----+-----+
| [-0.17271825463364]|               [0.0]| 95|               1.0|  pop|    0|
|[-0.0788580555146...|[0.08703754544258...|104|0.9571428571428572|  pop|    0|
|[-0.1323773156015...|[0.17371466010808...|108|0.4714285714285714|  pop|    0|
|[-0.199402712165348]|[-0.2400175631046...| 62|0.4714285714285714|  pop|    0|
|[-0.2174147513295...|[0.20619826391339...| 35|0.8571428571428571|  pop|    0|
|[-0.1380967511278...|[-0.1761963590979...| 36|0.4714285714285714|  pop|    0|
|[-0.1680768328703...|               [0.0]| 61|0.4714285714285714|  pop|    0|
|[-0.1165624487524...|[0.1281331479549408]| 66|0.4714285714285714|  pop|    0|
|[-0.1250098692253...|               [0.0]| 16|0.4714285714285714|  pop|    0|
|[-0.1461259101331...|[-0.0460347831249...| 50|0.471

In [101]:
finalized_features=finalised_data.schema.names

In [102]:
## Logistic Regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression( featuresCol='result_lyrics',labelCol="label")
lrn = lr.fit(finalised_data)

In [None]:
train.show('result_lyrics','result_track')

+-----+-----------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------+------------------+-----+
|  _c0|      artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|    family/spiritual|          like/girls|             sadness|  

In [None]:
tokenized = tokenizer_1.transform(train)
tokenized.select("genre","lyrics_words").show()

+-----+--------------------+
|genre|               words|
+-----+--------------------+
|  pop|[hold, time, feel...|
|  pop|[yeah, dance, cei...|
|  pop|[real, leftwinger...|
|  pop|[bring, bring, st...|
|  pop|[darling, hunger,...|
|  pop|[know, know, go, ...|
|  pop|[cool, anymore, t...|
|  pop|[night, motorcycl...|
|  pop|[collapse, lung, ...|
|  pop|[want, hand, want...|
|  pop|[away, star, shoo...|
|  pop|[head, shoulder, ...|
|  pop|[life, crazy, bod...|
|  pop|[wake, morning, c...|
|  pop|[rabbit, fall, aw...|
|  pop|[fly, face, fashi...|
|  pop|[whisper, whisper...|
|  pop|[head, border, mi...|
|  pop|[like, look, go, ...|
|  pop|[kick, kick, kick...|
+-----+--------------------+
only showing top 20 rows



'words'