In [None]:
import pyspark
from pyspark.sql import SparkSession
# Create (or reuse) the notebook's Spark session, named "Spark Trial",
# with off-heap memory enabled and sized at 10g.
spark = (
    SparkSession.builder
    .appName("Spark Trial")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)

In [None]:
# Load the retail transactions. The first row supplies the column names and
# a double quote is the escape character inside quoted fields. No schema is
# inferred, so every column is read as a string.
df = (
    spark.read
    .option("header", True)
    .option("escape", '"')
    .csv('online_retail.csv')
)

# Preview the first five rows without truncating long cell values.
df.show(5, truncate=False)

In [None]:
# Total number of rows in the loaded frame.
df.count()  

In [None]:
# Number of distinct values in the CustomerID column.
df.select('CustomerID').distinct().count() 

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Distinct customers per country, in whatever order Spark returns the groups.
customers_per_country = countDistinct('CustomerID').alias('country_count')
df.groupBy('Country').agg(customers_per_country).show()

In [None]:
# Distinct customers per country, largest customer base first.
(
    df.groupBy('Country')
    .agg(countDistinct('CustomerID').alias('country_count'))
    .orderBy(col('country_count').desc())
    .show()
)

In [None]:
# Switch Spark to the legacy datetime parser before parsing InvoiceDate.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# Parse the raw InvoiceDate strings into a timestamp column named `date`.
# NOTE(review): the pattern assumes day/month/two-digit-year ordering; the raw
# file is not visible here, so confirm it against the InvoiceDate/date pairs
# printed below before trusting any recency figures derived from `date`.
invoice_ts = to_timestamp(col("InvoiceDate"), 'dd/MM/yy HH:mm')
df = df.withColumn('date', invoice_ts)

# `max` is pyspark.sql.functions.max (brought in by the star import), not the
# Python builtin.
df.agg(max("date")).show()

# Show raw and parsed values side by side to eyeball the parse.
df.select("InvoiceDate", "date").show(n=10, truncate=False)


In [None]:
# Earliest parsed invoice timestamp (`min` is the pyspark.sql.functions
# aggregate pulled in by the star import, not the Python builtin).
df.agg(min("date")).show()



In [None]:
# Reference instant that recency is measured from, parsed with the same
# pattern used for the invoice dates.
reference_ts = to_timestamp(lit("12/01/10 08:26"), 'dd/MM/yy HH:mm')
df = df.withColumn("from_date", reference_ts)

# Recency is the number of seconds between the reference instant and the
# invoice timestamp; both are cast to long (epoch seconds) before subtracting.
seconds_since_reference = col("date").cast("long") - col('from_date').cast("long")
df2 = (
    df.withColumn('from_date', to_timestamp(col('from_date')))
    .withColumn('recency', seconds_since_reference)
)

In [None]:
# Keep only the rows belonging to each customer's most recent purchase.
#
# The semi-join has to match on CustomerID *and* recency. Matching on recency
# alone keeps any row whose recency happens to equal some other customer's
# maximum, so older purchases leak through whenever two customers share a
# timestamp.
#
# Rows with a null CustomerID never satisfy the equality join and are dropped
# here; the later inner joins on CustomerID would have discarded them anyway.
latest_per_customer = df2.groupBy('CustomerID').agg(max('recency').alias('recency'))
df2 = df2.join(latest_per_customer, on=['CustomerID', 'recency'], how='leftsemi')



In [None]:
# Preview the first five rows of df2.
df2.show(5)

In [None]:
# Print df2's column names and types.
df2.printSchema()



In [None]:
# Frequency = number of non-null InvoiceDate entries per customer.
# NOTE(review): this counts rows of df2, which an earlier cell already
# narrowed with a recency semi-join. If frequency is meant to cover a
# customer's whole purchase history it should be computed from the
# unfiltered frame instead - confirm the intent.
invoice_count = count('InvoiceDate').alias('frequency')
df_freq = df2.groupBy('CustomerID').agg(invoice_count)

In [None]:
# Preview five customers' frequencies without truncating cell values.
df_freq.show(5, truncate=False)



In [None]:
# Attach each customer's frequency to the rows of df2 (inner join, so only
# customers present in both frames survive), then inspect the schema.
df3 = df2.join(df_freq, 'CustomerID', 'inner')
df3.printSchema()




In [None]:
# Monetary value = summed line totals per customer.
#
# This cell used to read `m_val` before anything had assigned it, and nothing
# in the notebook created the TotalAmount column, so it raised NameError on a
# fresh kernel. Both are now derived here from df3.
#
# NOTE(review): assumes the CSV has Quantity and UnitPrice columns (neither is
# referenced elsewhere in this notebook) - confirm against the schema printed
# for df3. The CSV is read without schema inference, so both are cast to
# double explicitly before multiplying.
# NOTE(review): df3 only holds the rows kept by the recency semi-join, so this
# sums the retained rows, not a customer's full history - confirm the intent.
line_totals = df3.withColumn(
    'TotalAmount',
    col('Quantity').cast('double') * col('UnitPrice').cast('double'),
)
m_val = line_totals.groupBy('CustomerID').agg(sum('TotalAmount').alias('monetary_value'))




In [None]:
# Combine per-customer monetary value with the recency/frequency rows; the
# inner join keeps only customers that appear in both frames.
finaldf = m_val.join(df3, 'CustomerID', 'inner')



In [None]:
# Reduce to the three RFM measures plus the customer key and drop duplicate
# rows, then inspect the resulting schema.
rfm_columns = ['recency', 'frequency', 'monetary_value', 'CustomerID']
finaldf = finaldf.select(*rfm_columns).distinct()

finaldf.printSchema()


In [None]:
# Show customers whose monetary value is negative.
#
# print() on a Spark DataFrame only prints its schema repr
# ("DataFrame[recency: bigint, ...]") and never evaluates the filter, so the
# rows are displayed with filter(...).show() instead. The select/distinct
# that used to precede this line repeated the previous cell verbatim and has
# been dropped.
finaldf.filter(finaldf['monetary_value'] < 0).show()



In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

# Pack the three RFM measures into a single vector column named `features`.
rfm_features = ['recency', 'frequency', 'monetary_value']
assemble = VectorAssembler(inputCols=rfm_features, outputCol='features')
assembled_data = assemble.transform(finaldf)

# Fit a StandardScaler (library default settings) on the assembled vectors
# and write the scaled vectors to a new `standardized` column.
scale = StandardScaler(inputCol='features', outputCol='standardized')
data_scale = scale.fit(assembled_data)
data_scale_output = data_scale.transform(assembled_data)

In [None]:
# Peek at two standardized feature vectors, untruncated.
data_scale_output.select('standardized').show(n=2, truncate=False)


In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np

# cost[k] holds the K-Means training cost for k clusters. Slots 0 and 1 stay
# at zero because k starts at 2; the plotting cell slices cost[2:].
cost = np.zeros(10)

# Silhouette evaluator over the standardized features. It is configured here
# but not called anywhere in the loop below.
evaluator = ClusteringEvaluator(
    predictionCol='prediction',
    featuresCol='standardized',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

for k in range(2, 10):
    # A fixed seed makes the centroid initialisation - and therefore the
    # elbow curve - reproducible across kernel restarts.
    KMeans_algo = KMeans(featuresCol='standardized', k=k, seed=42)
    KMeans_fit = KMeans_algo.fit(data_scale_output)
    # Cluster assignments for this k; not read by any other visible cell.
    output = KMeans_fit.transform(data_scale_output)
    cost[k] = KMeans_fit.summary.trainingCost

In [None]:
import pandas as pd
import pylab as pl
# Tabulate training cost against cluster count. Only k = 2..9 were fitted,
# so the first two (zero) slots of `cost` are skipped.
new_col = range(2, 10)
df_cost = pd.DataFrame({'cluster': new_col, 'cost': cost[2:]})

# Elbow curve: training cost as a function of the number of clusters.
pl.plot(df_cost['cluster'], df_cost['cost'])
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()

In [None]:
# Fit the final model with k=4 clusters on the standardized features.
# The seed is fixed so the cluster assignments (and every chart derived from
# them) are reproducible across kernel restarts.
KMeans_algo = KMeans(featuresCol='standardized', k=4, seed=42)
KMeans_fit = KMeans_algo.fit(data_scale_output)

In [None]:
# Assign every customer to a cluster (adds the model's prediction column)
# and preview five rows without truncating the vector columns.
preds = KMeans_fit.transform(data_scale_output)
preds.show(5, truncate=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bring the RFM measures and cluster labels into pandas for plotting.
df_viz = preds.select('recency', 'frequency', 'monetary_value', 'prediction').toPandas()

# Mean of each measure within every cluster.
avg_df = df_viz.groupby('prediction', as_index=False).mean()

list1 = ['recency', 'frequency', 'monetary_value']

# One bar chart per measure, comparing the cluster averages.
for metric in list1:
    sns.barplot(x='prediction', y=metric, data=avg_df)
    plt.show()

In [None]:

"""SimpleApp.py"""
from pyspark.sql import SparkSession

# Count how many lines of a text file contain 'a' and how many contain 'b'.
logFile = "untitled.txt"  # Should be some file on your system
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()

# Each line of the file becomes one row with a single `value` column; the
# frame is cached because it is scanned more than once below.
logData = spark.read.text(logFile).cache()
numAs = logData.filter(logData.value.contains('a')).count()
numBs = logData.filter(logData.value.contains('b')).count()

# PySpark DataFrames have no flatMap, and each element is a Row rather than a
# str, so go through the underlying RDD and split each row's `value` text.
words = logData.rdd.flatMap(lambda row: row.value.split(" "))
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

# NOTE(review): getOrCreate() reuses an already-active session, so this also
# stops the session the earlier cells were using.
spark.stop()
