In [1]:
%%bash
apt-get install openjdk-17-jdk-headless -qq > /dev/null

In [2]:
%%bash
wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 
tar xf spark-3.3.0-bin-hadoop3.tgz

In [3]:
import os
os.environ['JAVA_HOME']='/usr/lib/jvm/java-1.17.0-openjdk-amd64'
os.environ['SPARK_HOME']='/content/spark-3.3.0-bin-hadoop3'

In [4]:
%%bash
pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
import findspark
findspark.init()

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [8]:
df = spark.read.csv('/content/gdrive/My Drive/DS_2019_public.csv',
                    inferSchema=True,header=True)

In [9]:
df.show(5)

+------------------+--------+-----------------+--------+-----------+--------+--------+-----------+-----------+-----------+------+-----------+--------+--------+-----+--------+--------+-----------+---------+--------+--------+--------+--------+-------+------+-----------+--------+-----------+-----+--------+--------+--------+------+--------+------+-----+--------+--------+--------+--------+--------+--------+-----+-----------+--------+--------+-------+-----------+--------+--------+------------+---------+---------+---------+-------+----------+--------+--------+-------+--------+--------+--------+---------+------+-------+-------+--------+--------+--------+---------+--------+--------+--------+--------+--------+-------+------------+---------+-----------+-----------+--------+--------+----+------+-------+--------+-----+----+-------+-------+-----------+--------+-------+--------+-----------+---------+--------+---------+-----------+--------+-----+--------+---------+-----+-----+--------+-----------+----

In [10]:
df.printSchema()

root
 |-- Climate_Region_Pub: integer (nullable = true)
 |-- DIVISION: integer (nullable = true)
 |-- REPORTABLE_DOMAIN: integer (nullable = true)
 |-- DOLELCOL: string (nullable = true)
 |-- TOTALDOLCOL: integer (nullable = true)
 |-- KWHCOL: double (nullable = true)
 |-- BTUELCOL: double (nullable = true)
 |-- TOTALBTUCOL: integer (nullable = true)
 |-- TOTALDOLSPH: integer (nullable = true)
 |-- TOTALBTUSPH: integer (nullable = true)
 |-- CELLAR: integer (nullable = true)
 |-- NWEIGHT: double (nullable = true)
 |-- TOTHSQFT: integer (nullable = true)
 |-- HEATHOME: integer (nullable = true)
 |-- NUMPC: integer (nullable = true)
 |-- DOLLAREL: integer (nullable = true)
 |-- DOLELOTH: double (nullable = true)
 |-- CUFEETNGSPH: double (nullable = true)
 |-- BTUNGSPH: double (nullable = true)
 |-- DOLNGSPH: double (nullable = true)
 |-- TEMPHOME: integer (nullable = true)
 |-- TOTCSQFT: integer (nullable = true)
 |-- BTUFOSPH: double (nullable = true)
 |-- AIRCOND: integer (nullable = t

In [11]:
df.columns

['Climate_Region_Pub',
 'DIVISION',
 'REPORTABLE_DOMAIN',
 'DOLELCOL',
 'TOTALDOLCOL',
 'KWHCOL',
 'BTUELCOL',
 'TOTALBTUCOL',
 'TOTALDOLSPH',
 'TOTALBTUSPH',
 'CELLAR',
 'NWEIGHT',
 'TOTHSQFT',
 'HEATHOME',
 'NUMPC',
 'DOLLAREL',
 'DOLELOTH',
 'CUFEETNGSPH',
 'BTUNGSPH',
 'DOLNGSPH',
 'TEMPHOME',
 'TOTCSQFT',
 'BTUFOSPH',
 'AIRCOND',
 'ELCOOL',
 'GALLONFOSPH',
 'WALLTYPE',
 'TOTALDOLOTH',
 'BTUFO',
 'GALLONFO',
 'DOLFOSPH',
 'DOLLARFO',
 'KWHSPH',
 'BTUELSPH',
 'FOWARM',
 'USEFO',
 'TOTUSQFT',
 'TOTALDOL',
 'NUMTHERM',
 'DOLELSPH',
 'CONCRETE',
 'CUFEETNG',
 'BTUNG',
 'GALLONFOOTH',
 'BTUFOOTH',
 'BEDROOMS',
 'FOWATER',
 'GALLONFOWTH',
 'BTUFOWTH',
 'DOLFOWTH',
 'GALLONKEROTH',
 'BTUKEROTH',
 'DOLKEROTH',
 'TOTUCSQFT',
 'TOTSQFT',
 'TOTSQFT_EN',
 'DOLNGWTH',
 'DOLFOOTH',
 'ELOTHER',
 'DOLLARNG',
 'FUELHEAT',
 'NOTMOIST',
 'GALLONKER',
 'BTUKER',
 'NUMCFAN',
 'FOILAUX',
 'TOTALBTU',
 'COOLTYPE',
 'DOOR1SUM',
 'DOLLARKER',
 'DOLLPOTH',
 'DOLELWTH',
 'KWHWTH',
 'BTUELWTH',
 'TOTROOMS',
 

In [12]:
print(df.count(),len(df.columns))

10875 121


In [13]:
df.groupby('Climate_Region_Pub').count().show()

+------------------+-----+
|Climate_Region_Pub|count|
+------------------+-----+
|                 1| 3593|
|                 3| 1957|
|                 5|  612|
|                 4| 3169|
|                 2| 1544|
+------------------+-----+



In [14]:
df_study = df.filter((df['Climate_Region_Pub']==1)|(df['Climate_Region_Pub']==2)|(df['Climate_Region_Pub']==3)|(df['Climate_Region_Pub']==4)|(df['Climate_Region_Pub']==5))
df_study.groupby('Climate_Region_Pub').count().orderBy('Climate_Region_Pub').show()

+------------------+-----+
|Climate_Region_Pub|count|
+------------------+-----+
|                 1| 3593|
|                 2| 1544|
|                 3| 1957|
|                 4| 3169|
|                 5|  612|
+------------------+-----+



In [15]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def inTakeReasonCoding(intakereason):
  if intakereason == 3:
    code_of_reason = 0
  else:
    code_of_reason = 1
  return code_of_reason

In [16]:
reason_coding_udf = udf(inTakeReasonCoding,IntegerType())

df_study = df_study.withColumn('Climate_Region_Pub',reason_coding_udf(df_study['Climate_Region_Pub']))
df_study.groupby('Climate_Region_Pub').count().show()

+------------------+-----+
|Climate_Region_Pub|count|
+------------------+-----+
|                 1| 8918|
|                 0| 1957|
+------------------+-----+



In [17]:
summary_data = df_study.describe()
summary_data.show()

+-------+------------------+------------------+-----------------+------------------+------------------+------------------+----------------+-----------------+-----------------+-----------------+--------------------+-----------------+------------------+-------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+-----------------+-----------------+------------------+-------------------+------------------+---------

In [18]:
input_features = ['DIVISION',
 'REPORTABLE_DOMAIN','TOTALDOLCOL','KWHCOL','BTUELCOL',
 'TOTALBTUCOL','TOTALDOLSPH','TOTALBTUSPH',
 'CELLAR','NWEIGHT','TOTHSQFT','HEATHOME',
 'NUMPC','DOLLAREL','DOLELOTH','CUFEETNGSPH',
 'BTUNGSPH','DOLNGSPH','TEMPHOME','TOTCSQFT',
 'BTUFOSPH','AIRCOND','ELCOOL','GALLONFOSPH',
 'WALLTYPE','TOTALDOLOTH','BTUFO',
 'GALLONFO','DOLLARFO','KWHSPH',
 'BTUELSPH','FOWARM','USEFO','TOTUSQFT',
 'TOTALDOL','NUMTHERM','CONCRETE','CUFEETNG',
 'BTUNG','GALLONFOOTH','BTUFOOTH','BEDROOMS',
 'FOWATER','GALLONFOWTH','BTUFOWTH','DOLFOWTH',
 'GALLONKEROTH','BTUKEROTH','DOLKEROTH','TOTUCSQFT',
 'TOTSQFT','TOTSQFT_EN','DOLFOOTH',
 'ELOTHER','DOLLARNG','FUELHEAT','NOTMOIST',
 'GALLONKER','BTUKER','NUMCFAN','FOILAUX',
 'TOTALBTU','COOLTYPE','DOOR1SUM','DOLLARKER',
 'DOLLPOTH','DOLELWTH','KWHWTH','BTUELWTH',
 'TOTROOMS','ELWATER','GALLONKERSPH','BTUKERSPH',
 'TOTALBTUOTH','GALLONLPSPH','BTULPSPH','NCOMBATH',
 'OVEN','EQUIPM','STORIES','GALLONLP',
 'BTULP','LGT1','SOLWARM','FUELH2O',
 'GALLONLPOTH','BTULPOTH','TVCOLOR','SOLARAUX',
 'BTUNGOTH','DOLKERSPH','TOTALDOLWTH','DOLLPSPH',
 'USENG','YEARMADE','BTUELOTH','KWH',
 'BTUEL','DOLLARLP','BTUNGWTH','UGWATER',
 'SDESCENT','TEMPGONE','LGT1EE','TOTALBTUWTH',
 'ROOFTYPE','TOTALDOLRFG','HEATROOM',
 'WDWATER','UGWARM','DRYRFUEL', 'KWHRFG']

In [19]:
from pyspark.ml.feature import VectorAssembler
df_assembler = VectorAssembler(inputCols=input_features,outputCol='features')

In [20]:
df_cover_type = df_assembler.transform(df_study)
df_cover_type.printSchema()

root
 |-- Climate_Region_Pub: integer (nullable = true)
 |-- DIVISION: integer (nullable = true)
 |-- REPORTABLE_DOMAIN: integer (nullable = true)
 |-- DOLELCOL: string (nullable = true)
 |-- TOTALDOLCOL: integer (nullable = true)
 |-- KWHCOL: double (nullable = true)
 |-- BTUELCOL: double (nullable = true)
 |-- TOTALBTUCOL: integer (nullable = true)
 |-- TOTALDOLSPH: integer (nullable = true)
 |-- TOTALBTUSPH: integer (nullable = true)
 |-- CELLAR: integer (nullable = true)
 |-- NWEIGHT: double (nullable = true)
 |-- TOTHSQFT: integer (nullable = true)
 |-- HEATHOME: integer (nullable = true)
 |-- NUMPC: integer (nullable = true)
 |-- DOLLAREL: integer (nullable = true)
 |-- DOLELOTH: double (nullable = true)
 |-- CUFEETNGSPH: double (nullable = true)
 |-- BTUNGSPH: double (nullable = true)
 |-- DOLNGSPH: double (nullable = true)
 |-- TEMPHOME: integer (nullable = true)
 |-- TOTCSQFT: integer (nullable = true)
 |-- BTUFOSPH: double (nullable = true)
 |-- AIRCOND: integer (nullable = t

In [21]:
df_train,df_test = df_cover_type.randomSplit([0.75,0.25])
print('Обучающая выборка')
df_train.groupby('Climate_Region_Pub').count().show()
print('Тестовая выборка')
df_test.groupby('Climate_Region_Pub').count().show()

Обучающая выборка
+------------------+-----+
|Climate_Region_Pub|count|
+------------------+-----+
|                 1| 6749|
|                 0| 1436|
+------------------+-----+

Тестовая выборка
+------------------+-----+
|Climate_Region_Pub|count|
+------------------+-----+
|                 1| 2169|
|                 0|  521|
+------------------+-----+



In [22]:
#RandomForestClassifier
from pyspark.ml.classification import RandomForestClassifier
rf_model = RandomForestClassifier(labelCol='Climate_Region_Pub',numTrees=50).fit(df_train)

In [23]:
rf_predictions = rf_model.transform(df_test)

In [24]:
TP = rf_predictions[(rf_predictions['Climate_Region_Pub']==1)&(rf_predictions['prediction']==1)].count()
print('Доля истинно-положительных',TP)
TN = rf_predictions[(rf_predictions['Climate_Region_Pub']==0)&(rf_predictions['prediction']==0)].count()
print('Доля истинно-отрицательных',TN)
FP = rf_predictions[(rf_predictions['Climate_Region_Pub']==0)&(rf_predictions['prediction']==1)].count()
print('Доля ложно-положительных',FP)
FN = rf_predictions[(rf_predictions['Climate_Region_Pub']==1)&(rf_predictions['prediction']==0)].count()
print('Доля ложно-отрицательных',FN)


Доля истинно-положительных 2146
Доля истинно-отрицательных 367
Доля ложно-положительных 154
Доля ложно-отрицательных 23


In [25]:
#DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(labelCol='Climate_Region_Pub').fit(df_train)
dt_predictions = dt_model.transform(df_test)

TP = dt_predictions[(dt_predictions['Climate_Region_Pub']==1)&(dt_predictions['prediction']==1)].count()
print('Доля истинно-положительных',TP)
TN = dt_predictions[(dt_predictions['Climate_Region_Pub']==0)&(dt_predictions['prediction']==0)].count()
print('Доля истинно-отрицательных',TN)
FP = dt_predictions[(dt_predictions['Climate_Region_Pub']==0)&(dt_predictions['prediction']==1)].count()
print('Доля ложно-положительных',FP)
FN = dt_predictions[(dt_predictions['Climate_Region_Pub']==1)&(dt_predictions['prediction']==0)].count()
print('Доля ложно-отрицательных',FN)

Доля истинно-положительных 2118
Доля истинно-отрицательных 449
Доля ложно-положительных 72
Доля ложно-отрицательных 51


In [26]:
#LogisticRegression
from pyspark.ml.classification import LogisticRegression
lr_model = LogisticRegression(labelCol='Climate_Region_Pub').fit(df_train)
lr_predictions = lr_model.transform(df_test)

TP = lr_predictions[(lr_predictions['Climate_Region_Pub']==1)&(lr_predictions['prediction']==1)].count()
print('Доля истинно-положительных',TP)
TN = lr_predictions[(lr_predictions['Climate_Region_Pub']==0)&(lr_predictions['prediction']==0)].count()
print('Доля истинно-отрицательных',TN)
FP = lr_predictions[(lr_predictions['Climate_Region_Pub']==0)&(lr_predictions['prediction']==1)].count()
print('Доля ложно-положительных',FP)
FN = lr_predictions[(lr_predictions['Climate_Region_Pub']==1)&(lr_predictions['prediction']==0)].count()
print('Доля ложно-отрицательных',FN)

Доля истинно-положительных 2114
Доля истинно-отрицательных 447
Доля ложно-положительных 74
Доля ложно-отрицательных 55
