# Importar as bibliotecas

In [59]:
from pyspark.sql import SparkSession

from pyspark.ml.regression import LinearRegression

In [3]:
# para funcionar preciso ter o spark instalado e configurado(variáveis de ambiente) no computador
contexto = SparkSession.builder.appName("regressao_linear1").getOrCreate()

In [5]:
contexto

In [16]:
# Spark  pode ler arquivos locais sem necessitar de hdfs
# Mas também está preparado para ler de um sistema de arquivos hdfs

# header=True para habilitar o cabeçalho
# inferSchema=True  inferir os tipos de dados
dados = contexto.read.csv("Ecommerce_Customers.csv", header=True, inferSchema=True)

In [17]:
dados.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [18]:
dados.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [20]:
dados.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [22]:
for campo in dados.head():
    print(campo)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


# Iremos organizar os dados para passar para a lib de machine learning do spark

antes deste ponto em um trabalho o cientista de dados precisaria limpeza nos dados

label      será o target aquilo que desejo prever

features   entradas

In [25]:
# label Yearly Amount Spent

# entrada / features <-->  'Yearly Amount Spent'
#  'Avg Session Length', 'Time on App','Time on Website',  'Length of Membership'

dados.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [26]:
# para organizar os dados em labels(rótulos) e features(entradas) iremos usar o VectorAssembler
from pyspark.ml.feature import VectorAssembler

In [66]:
# no parâmetro inputCols você irá colocar os campos que serão entrada para o algoritmo de regressão linear
# no parâmetro outputCol é o nome que irei dar para este vetor
vector_assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App','Time on Website',  'Length of Membership'],
               outputCol='features')

In [30]:
vector_assembler

VectorAssembler_250f2e840e03

In [67]:
dados_transformados = vector_assembler.transform(dados)

In [68]:
dados_transformados.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [69]:
# caso queira ver determinado campo do conjunto de dados
dados_transformados.select("features").show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



Já realizar uma transformação de dados para um vetor

precisamos passar para o algoritmo de regressão linear do spark o vetor que colocamos o nome de entradas 
e o alvo/target que é Yearly Amount Spent

In [70]:
dados_transformados_final = dados_transformados.select("features", "Yearly Amount Spent")

In [71]:
dados_transformados_final.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

# Divisão entre dados de treino e teste

In [72]:
# iremos utilizar randomSplit para dividir 70%(0.7) para dados de treino e 30%(0.3) para dados de teste
dados_treino, dados_teste = dados_transformados_final.randomSplit([0.7, 0.3])

In [73]:
dados_treino.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.3931845423455...|  319.9288698031936|
|[30.5743636841713...| 442.06441375806565|
|[30.8162006488763...|   266.086340948469|
|[30.8364326747734...|  467.5019004269896|
|[30.8794843441274...|  490.2065999848547|
|[30.9716756438877...|  494.6386097568927|
|[31.0472221394875...|  392.4973991890214|
|[31.0613251567161...|  487.5554580579016|
|[31.0662181616375...| 448.93329320767435|
|[31.1280900496166...|  557.2526867470547|
|[31.1695067987115...|  427.3565308022928|
|[31.2606468698795...|  421.3266312569514|
|[31.2834474760581...|  591.7810894256675|
|[31.3091926408918...|  432.7207178399336|
|[31.3123495994443...|  463.5914180279406|
|[31.3584771924370...|  495.1759504494754|
|[31.4252268808548...|  530.7667186547619|
|[31.4459724827577...| 484.87696493512857|
|[31.4474464941278...|   418.602742095224|
|[31.5147378578019...|  489.8124879964614|
+----------

In [74]:
dados_teste.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.5324289670579...|  408.6403510726275|
|[30.4925366965402...|  282.4712457199145|
|[30.7377203726281...|  461.7807421962299|
|[31.1239743499119...|  486.9470538397658|
|[31.2681042107507...|  423.4705331738239|
|[31.3662121671876...|  430.5888825564849|
|[31.3895854806643...|  410.0696110599829|
|[31.6253601348306...|  376.3369007569242|
|[31.6610498227460...| 416.35835357990084|
|[31.7207699002873...|   538.774933478023|
|[31.7216523605090...| 347.77692663187264|
|[31.7242025238451...|  503.3878872879605|
|[31.8124825597242...|  392.8103449837972|
|[31.8627411090001...|  556.2981411740467|
|[31.8648325480987...|  439.8912804768137|
|[31.8745516945853...|  392.2852442462675|
|[31.8854062999117...|  390.1032729724755|
|[31.9120759292006...|  387.5347163057077|
|[32.0180740106320...|  357.7831107453153|
|[32.0215955013870...|  521.5721747578274|
+----------

In [75]:
dados_treino.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                359|
|   mean| 499.23505976449644|
| stddev|  78.59501298339123|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [76]:
dados_teste.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                141|
|   mean|  499.5151253463915|
| stddev|  81.40241656693746|
|    min|  282.4712457199145|
|    max|  744.2218671047146|
+-------+-------------------+



# Vamos aplicar um modelo de regressão linear usando o spark

In [62]:
# importar classe de regressão linear no spark
from pyspark.ml.regression import LinearRegression

In [77]:
# vamos instanciar o modelo de regressão linear
# labelCol você informa a coluna alvo/target
modelo_regressao_linear = LinearRegression(labelCol='Yearly Amount Spent')

# Vamos treinar o modelo 

In [110]:
# irei utilizar o modelo criado modelo_regressao_linear para treinar
# por padrão o nome da coluna  dos dados treino que é um vetor a classe espera "features"
modelo_regressao_linear_treinado = modelo_regressao_linear.fit(dados_treino)

In [111]:
modelo_regressao_linear_treinado.coefficients

DenseVector([25.9829, 39.0132, 0.3182, 61.8802])

In [112]:
modelo_regressao_linear_treinado.intercept

-1060.3077088019595

In [119]:
print("coeficientes ", modelo_regressao_linear_treinado.coefficients)

coeficientes  [25.982906821106845,39.013181085247275,0.3181535108829295,61.88018008111657]


In [None]:
modelo_regressao_linear_treinado.coefficients=[25.982906821106845,39.013181085247275,0.3181535108829295,61.88018008111657]

In [86]:
print("intercept ", modelo_regressao_linear_treinado.intercept)

intercept  -1060.3077088019595


In [94]:
dados_teste.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.5324289670579...|  408.6403510726275|
|[30.4925366965402...|  282.4712457199145|
|[30.7377203726281...|  461.7807421962299|
|[31.1239743499119...|  486.9470538397658|
|[31.2681042107507...|  423.4705331738239|
|[31.3662121671876...|  430.5888825564849|
|[31.3895854806643...|  410.0696110599829|
|[31.6253601348306...|  376.3369007569242|
|[31.6610498227460...| 416.35835357990084|
|[31.7207699002873...|   538.774933478023|
|[31.7216523605090...| 347.77692663187264|
|[31.7242025238451...|  503.3878872879605|
|[31.8124825597242...|  392.8103449837972|
|[31.8627411090001...|  556.2981411740467|
|[31.8648325480987...|  439.8912804768137|
|[31.8745516945853...|  392.2852442462675|
|[31.8854062999117...|  390.1032729724755|
|[31.9120759292006...|  387.5347163057077|
|[32.0180740106320...|  357.7831107453153|
|[32.0215955013870...|  521.5721747578274|
+----------

In [95]:
# selecionar apenas a coluna features
dados_teste_coluna_features = dados_teste.select("features")

# Predição

In [116]:
predicoes = modelo_regressao_linear_treinado.transform(dados_teste_coluna_features)

In [115]:
predicoes.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...| 396.9645277191571|
|[30.4925366965402...|286.21273072750637|
|[30.7377203726281...| 450.6475189013913|
|[31.1239743499119...|508.32983362374944|
|[31.2681042107507...| 426.8376340433779|
|[31.3662121671876...| 426.0159969315321|
|[31.3895854806643...|408.49660359296786|
|[31.6253601348306...|380.17468826890286|
|[31.6610498227460...| 416.5733186515465|
|[31.7207699002873...| 545.5406731297267|
|[31.7216523605090...| 348.5266552023809|
|[31.7242025238451...| 509.5473577083476|
|[31.8124825597242...| 395.7100909922044|
|[31.8627411090001...| 558.4196201916977|
|[31.8648325480987...|449.94074923567314|
|[31.8745516945853...|397.12393334190483|
|[31.8854062999117...|  398.249315662541|
|[31.9120759292006...|388.69111941511915|
|[32.0180740106320...| 338.9717140856819|
|[32.0215955013870...| 516.7246113349081|
+--------------------+------------

# Avaliar o resultado 

In [102]:
avaliacao_modelo = modelo_regressao_linear_treinado.evaluate(dados_teste)

In [104]:
avaliacao_modelo.rootMeanSquaredError

9.978169074383096

In [106]:
avaliacao_modelo.meanSquaredError

99.56385807697522

In [108]:
avaliacao_modelo.meanAbsoluteError

7.8871652108750565

In [109]:
print("RMSE ", avaliacao_modelo.rootMeanSquaredError)
print("MSE ", avaliacao_modelo.meanSquaredError )
print("MAE ", avaliacao_modelo.meanAbsoluteError )

RMSE  9.978169074383096
MSE  99.56385807697522
MAE  7.8871652108750565
