In [44]:
# https://geo-python.github.io/site/notebooks/L5/processing-data-with-pandas.html

import os
import pandas as pd
# import matplotlib.pyplot as plt
import numpy as np


# Текущая директория
ROOT_PATH = os.path.abspath(os.curdir)


In [45]:
# Путь к файлу Kumpula-June-2016-w-metadata.txt
file_name = "Kumpula-June-2016-w-metadata.txt"
DATASET = os.path.join(ROOT_PATH, "datasets\\") 
file_path = os.path.join(DATASET, file_name)

# чтение файла с пропуском 8ми первых строк
df = pd.read_csv(file_path, sep=',', skiprows=8)

In [46]:
# Содержимое файла данных
print(df.head())

   YEARMODA  TEMP   MAX   MIN
0  20160601  65.5  73.6  54.7
1  20160602  65.8  80.8  55.0
2  20160603  68.4   NaN  55.6
3  20160604  57.5  70.9  47.3
4  20160605  51.4  58.3  43.2


# Обработка данных

In [47]:
# новый столбец и значение по умолчанию 0.0
df['DIFF'] = 0.0
df.head()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF
0,20160601,65.5,73.6,54.7,0.0
1,20160602,65.8,80.8,55.0,0.0
2,20160603,68.4,,55.6,0.0
3,20160604,57.5,70.9,47.3,0.0
4,20160605,51.4,58.3,43.2,0.0


In [48]:
# тип данных в новой колонке
df['DIFF'].dtypes

dtype('float64')

In [49]:
# новый столбец и значение разность температур
df['DIFF'] = df["MAX"] - df["MIN"]
df.head()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF
0,20160601,65.5,73.6,54.7,18.9
1,20160602,65.8,80.8,55.0,25.8
2,20160603,68.4,,55.6,
3,20160604,57.5,70.9,47.3,23.6
4,20160605,51.4,58.3,43.2,15.1


In [50]:
# новый столбец и значение разность температур
df['DIFF_MIN'] = df['TEMP'] - df['MIN']
df.head()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN
0,20160601,65.5,73.6,54.7,18.9,10.8
1,20160602,65.8,80.8,55.0,25.8,10.8
2,20160603,68.4,,55.6,,12.8
3,20160604,57.5,70.9,47.3,23.6,10.2
4,20160605,51.4,58.3,43.2,15.1,8.2


In [51]:
# Общая информация о данных таблицы df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   YEARMODA  30 non-null     int64  
 1   TEMP      30 non-null     float64
 2   MAX       27 non-null     float64
 3   MIN       28 non-null     float64
 4   DIFF      25 non-null     float64
 5   DIFF_MIN  28 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 1.5 KB


In [52]:
# Создайте новый столбец и преобразуйте temp fahrenheit в celsius:
df['TEMP_CELSIUS'] = (df['TEMP'] - 32) / (9/5)

#Проверьте вывод
print(df.head())


   YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS
0  20160601  65.5  73.6  54.7  18.9      10.8     18.611111
1  20160602  65.8  80.8  55.0  25.8      10.8     18.777778
2  20160603  68.4   NaN  55.6   NaN      12.8     20.222222
3  20160604  57.5  70.9  47.3  23.6      10.2     14.166667
4  20160605  51.4  58.3  43.2  15.1       8.2     10.777778


## ЗАДАЧА:

In [53]:
# Создайте новый столбец и преобразуйте temp цельсий в кельвин:
df["TEMP_KELVIN"] = df['TEMP_CELSIUS'] - 273.15 

#Проверьте вывод
print(df.head())


   YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS  TEMP_KELVIN
0  20160601  65.5  73.6  54.7  18.9      10.8     18.611111  -254.538889
1  20160602  65.8  80.8  55.0  25.8      10.8     18.777778  -254.372222
2  20160603  68.4   NaN  55.6   NaN      12.8     20.222222  -252.927778
3  20160604  57.5  70.9  47.3  23.6      10.2     14.166667  -258.983333
4  20160605  51.4  58.3  43.2  15.1       8.2     10.777778  -262.372222


# Выбор строк и столбцов:

- Выбор нескольких строк:

In [54]:
# Выберите первые пять строк фрейма данных, используя значения Индекса
selection = df[1:3]

print(selection)


   YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS  TEMP_KELVIN
1  20160602  65.8  80.8  55.0  25.8      10.8     18.777778  -254.372222
2  20160603  68.4   NaN  55.6   NaN      12.8     20.222222  -252.927778


- Выбор нескольких строк и столбцов:

In [55]:
# выборка по четырём строкам и колонке "DIFF"
selection = df.loc[1:4, "DIFF"]
print(selection)

1    25.8
2     NaN
3    23.6
4    15.1
Name: DIFF, dtype: float64


In [56]:
# выборка по четырём строкам и колонкам "MAX" "MIN" "DIFF"
selection = df.loc[1:4, ["MAX", "MIN", "DIFF"]]
print(selection)

    MAX   MIN  DIFF
1  80.8  55.0  25.8
2   NaN  55.6   NaN
3  70.9  47.3  23.6
4  58.3  43.2  15.1


- Выбор по одной строке:

In [57]:
# Выберите одну строку, используя index
selection = df.loc[4]

print(selection)


YEARMODA        2.016060e+07
TEMP            5.140000e+01
MAX             5.830000e+01
MIN             4.320000e+01
DIFF            1.510000e+01
DIFF_MIN        8.200000e+00
TEMP_CELSIUS    1.077778e+01
TEMP_KELVIN    -2.623722e+02
Name: 4, dtype: float64


- Выбор по одной колонке из одной строки:

In [58]:
# Выберите одну колонку
print(selection["TEMP"])

51.4


## ЗАДАЧА: 
Средняя температура за последние семь дней июня (используйте индексацию loc для выбора правильных строк):

In [59]:
# вариант без loc
df_temp = df[-7:]
print(df_temp["TEMP"])
print(df_temp["TEMP"].mean())

23    61.1
24    65.7
25    69.6
26    60.7
27    65.4
28    65.8
29    65.7
Name: TEMP, dtype: float64
64.85714285714286


In [60]:
# вариант с loc
df_temp = df.loc[23:]
print(df_temp["TEMP"])
print(df_temp["TEMP"].mean())

23    61.1
24    65.7
25    69.6
26    60.7
27    65.4
28    65.8
29    65.7
Name: TEMP, dtype: float64
64.85714285714286


In [61]:
# индексы целые числа
df_temp = df.iloc[-7:, 1:2]
print(df_temp.mean())

TEMP    64.857143
dtype: float64


# Фильтрация и обновление данных

In [62]:
# Выберите строки с температурой Цельсия выше 15 градусов с конца июня 2016 года
warm_temps = df.loc[(df["TEMP_CELSIUS"] > 15)&(df["YEARMODA"] >= 20160616)]
print(warm_temps)

    YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS  TEMP_KELVIN
16  20160617  60.4  70.7  55.9  14.8       4.5     15.777778  -257.372222
19  20160620  59.3  69.1  52.2  16.9       7.1     15.166667  -257.983333
20  20160621  62.6  71.4  50.4  21.0      12.2     17.000000  -256.150000
21  20160622  61.7  70.2  55.4  14.8       6.3     16.500000  -256.650000
22  20160623  60.9  67.1  54.9  12.2       6.0     16.055556  -257.094444
23  20160624  61.1  68.9  56.7  12.2       4.4     16.166667  -256.983333
24  20160625  65.7  75.4  57.9  17.5       7.8     18.722222  -254.427778
25  20160626  69.6  77.7  60.3  17.4       9.3     20.888889  -252.261111
26  20160627  60.7  70.0   NaN   NaN       NaN     15.944444  -257.205556
27  20160628  65.4  73.0  55.8  17.2       9.6     18.555556  -254.594444
28  20160629  65.8  73.2   NaN   NaN       NaN     18.777778  -254.372222
29  20160630  65.7  72.7  59.2  13.5       6.5     18.722222  -254.427778


## ЗАДАЧА:
Найдите средние температуры (в градусах Цельсия) за последние семь дней июня. Теперь выберите строки, основанные на условии для YEARMODAстолбца!

In [91]:
# Средняя температура за последние семь дней июня (используйте условный оператор для выбора правильных строк):
warm_temps = df.loc[df["YEARMODA"] >= 20160623]
print(warm_temps["TEMP_CELSIUS"].mean())
print(warm_temps["TEMP_CELSIUS"].mean().round(2))

17.979166666666668
17.98


# Работа с недостающими данными:

- Удаление всех сток с NaN

In [64]:
warm_temps = df.loc[(df["TEMP_CELSIUS"] > 15)&(df["YEARMODA"] >= 20160616)]
print(warm_temps)

    YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS  TEMP_KELVIN
16  20160617  60.4  70.7  55.9  14.8       4.5     15.777778  -257.372222
19  20160620  59.3  69.1  52.2  16.9       7.1     15.166667  -257.983333
20  20160621  62.6  71.4  50.4  21.0      12.2     17.000000  -256.150000
21  20160622  61.7  70.2  55.4  14.8       6.3     16.500000  -256.650000
22  20160623  60.9  67.1  54.9  12.2       6.0     16.055556  -257.094444
23  20160624  61.1  68.9  56.7  12.2       4.4     16.166667  -256.983333
24  20160625  65.7  75.4  57.9  17.5       7.8     18.722222  -254.427778
25  20160626  69.6  77.7  60.3  17.4       9.3     20.888889  -252.261111
26  20160627  60.7  70.0   NaN   NaN       NaN     15.944444  -257.205556
27  20160628  65.4  73.0  55.8  17.2       9.6     18.555556  -254.594444
28  20160629  65.8  73.2   NaN   NaN       NaN     18.777778  -254.372222
29  20160630  65.7  72.7  59.2  13.5       6.5     18.722222  -254.427778


In [65]:
# Reset index
warm_temps1 = warm_temps.reset_index(drop=True)
print(warm_temps1)

    YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS  TEMP_KELVIN
0   20160617  60.4  70.7  55.9  14.8       4.5     15.777778  -257.372222
1   20160620  59.3  69.1  52.2  16.9       7.1     15.166667  -257.983333
2   20160621  62.6  71.4  50.4  21.0      12.2     17.000000  -256.150000
3   20160622  61.7  70.2  55.4  14.8       6.3     16.500000  -256.650000
4   20160623  60.9  67.1  54.9  12.2       6.0     16.055556  -257.094444
5   20160624  61.1  68.9  56.7  12.2       4.4     16.166667  -256.983333
6   20160625  65.7  75.4  57.9  17.5       7.8     18.722222  -254.427778
7   20160626  69.6  77.7  60.3  17.4       9.3     20.888889  -252.261111
8   20160627  60.7  70.0   NaN   NaN       NaN     15.944444  -257.205556
9   20160628  65.4  73.0  55.8  17.2       9.6     18.555556  -254.594444
10  20160629  65.8  73.2   NaN   NaN       NaN     18.777778  -254.372222
11  20160630  65.7  72.7  59.2  13.5       6.5     18.722222  -254.427778


In [74]:
# наличие элементов NaN в структурах
warm_temps1.isnull()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False
8,False,False,False,True,True,True,False,False
9,False,False,False,False,False,False,False,False


In [75]:
# количество элементов NaN в структурах по столбцам
warm_temps1.isnull().sum()

YEARMODA        0
TEMP            0
MAX             0
MIN             2
DIFF            2
DIFF_MIN        2
TEMP_CELSIUS    0
TEMP_KELVIN     0
dtype: int64

In [80]:
# удалятся строки 8 и 10
df_rez = warm_temps1.dropna(subset=["MIN"])
df_rez
# print(warm_temps)

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
0,20160617,60.4,70.7,55.9,14.8,4.5,15.777778,-257.372222
1,20160620,59.3,69.1,52.2,16.9,7.1,15.166667,-257.983333
2,20160621,62.6,71.4,50.4,21.0,12.2,17.0,-256.15
3,20160622,61.7,70.2,55.4,14.8,6.3,16.5,-256.65
4,20160623,60.9,67.1,54.9,12.2,6.0,16.055556,-257.094444
5,20160624,61.1,68.9,56.7,12.2,4.4,16.166667,-256.983333
6,20160625,65.7,75.4,57.9,17.5,7.8,18.722222,-254.427778
7,20160626,69.6,77.7,60.3,17.4,9.3,20.888889,-252.261111
9,20160628,65.4,73.0,55.8,17.2,9.6,18.555556,-254.594444
11,20160630,65.7,72.7,59.2,13.5,6.5,18.722222,-254.427778


In [81]:
# проверка количества элементов NaN в структурах по столбцам
df_rez.isnull().sum()

YEARMODA        0
TEMP            0
MAX             0
MIN             0
DIFF            0
DIFF_MIN        0
TEMP_CELSIUS    0
TEMP_KELVIN     0
dtype: int64

 - Удаление столбцов с NaN

In [82]:
warm_temps2 = warm_temps.reset_index(drop=True)
warm_temps2

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
0,20160617,60.4,70.7,55.9,14.8,4.5,15.777778,-257.372222
1,20160620,59.3,69.1,52.2,16.9,7.1,15.166667,-257.983333
2,20160621,62.6,71.4,50.4,21.0,12.2,17.0,-256.15
3,20160622,61.7,70.2,55.4,14.8,6.3,16.5,-256.65
4,20160623,60.9,67.1,54.9,12.2,6.0,16.055556,-257.094444
5,20160624,61.1,68.9,56.7,12.2,4.4,16.166667,-256.983333
6,20160625,65.7,75.4,57.9,17.5,7.8,18.722222,-254.427778
7,20160626,69.6,77.7,60.3,17.4,9.3,20.888889,-252.261111
8,20160627,60.7,70.0,,,,15.944444,-257.205556
9,20160628,65.4,73.0,55.8,17.2,9.6,18.555556,-254.594444


In [84]:
df_rez = warm_temps2.dropna(axis=1)
df_rez

Unnamed: 0,YEARMODA,TEMP,MAX,TEMP_CELSIUS,TEMP_KELVIN
0,20160617,60.4,70.7,15.777778,-257.372222
1,20160620,59.3,69.1,15.166667,-257.983333
2,20160621,62.6,71.4,17.0,-256.15
3,20160622,61.7,70.2,16.5,-256.65
4,20160623,60.9,67.1,16.055556,-257.094444
5,20160624,61.1,68.9,16.166667,-256.983333
6,20160625,65.7,75.4,18.722222,-254.427778
7,20160626,69.6,77.7,20.888889,-252.261111
8,20160627,60.7,70.0,15.944444,-257.205556
9,20160628,65.4,73.0,18.555556,-254.594444


- Заполнение

In [85]:
warm_temps3 = warm_temps.reset_index(drop=True)
warm_temps3

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
0,20160617,60.4,70.7,55.9,14.8,4.5,15.777778,-257.372222
1,20160620,59.3,69.1,52.2,16.9,7.1,15.166667,-257.983333
2,20160621,62.6,71.4,50.4,21.0,12.2,17.0,-256.15
3,20160622,61.7,70.2,55.4,14.8,6.3,16.5,-256.65
4,20160623,60.9,67.1,54.9,12.2,6.0,16.055556,-257.094444
5,20160624,61.1,68.9,56.7,12.2,4.4,16.166667,-256.983333
6,20160625,65.7,75.4,57.9,17.5,7.8,18.722222,-254.427778
7,20160626,69.6,77.7,60.3,17.4,9.3,20.888889,-252.261111
8,20160627,60.7,70.0,,,,15.944444,-257.205556
9,20160628,65.4,73.0,55.8,17.2,9.6,18.555556,-254.594444


In [87]:
df_rez = warm_temps3.fillna(-9999.0)
df_rez

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
0,20160617,60.4,70.7,55.9,14.8,4.5,15.777778,-257.372222
1,20160620,59.3,69.1,52.2,16.9,7.1,15.166667,-257.983333
2,20160621,62.6,71.4,50.4,21.0,12.2,17.0,-256.15
3,20160622,61.7,70.2,55.4,14.8,6.3,16.5,-256.65
4,20160623,60.9,67.1,54.9,12.2,6.0,16.055556,-257.094444
5,20160624,61.1,68.9,56.7,12.2,4.4,16.166667,-256.983333
6,20160625,65.7,75.4,57.9,17.5,7.8,18.722222,-254.427778
7,20160626,69.6,77.7,60.3,17.4,9.3,20.888889,-252.261111
8,20160627,60.7,70.0,-9999.0,-9999.0,-9999.0,15.944444,-257.205556
9,20160628,65.4,73.0,55.8,17.2,9.6,18.555556,-254.594444


# Преобразование типов данных:

In [93]:
print("исходные значения:")
print(df['TEMP'].head())

исходные значения:
0    65.5
1    65.8
2    68.4
3    57.5
4    51.4
Name: TEMP, dtype: float64


In [95]:
print("усеченные целочисленные значения:")
print(df['TEMP'].astype(int).head())

усеченные целочисленные значения:
0    65
1    65
2    68
3    57
4    51
Name: TEMP, dtype: int32


In [97]:
# ТАК ПРАВИЛЬНО
print("округленные целочисленные значения:")
print(df['TEMP'].round(0).astype(int).head())

округленные целочисленные значения:
0    66
1    66
2    68
3    58
4    51
Name: TEMP, dtype: int32


## Уникальные значения:

In [99]:
df["TEMP"].unique()

array([65.5, 65.8, 68.4, 57.5, 51.4, 52.2, 56.9, 54.2, 49.4, 49.5, 54. ,
       55.4, 58.3, 59.7, 63.4, 57.8, 60.4, 57.3, 56.3, 59.3, 62.6, 61.7,
       60.9, 61.1, 65.7, 69.6, 60.7, 65.4])

## Сортировка данных:

In [101]:
# сортировка по колонке (по умолчанию - по возрастанию)
df.sort_values(by="TEMP").head()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
8,20160609,49.4,54.1,45.7,8.4,3.7,9.666667,-263.483333
9,20160610,49.5,55.9,43.0,12.9,6.5,9.722222,-263.427778
4,20160605,51.4,58.3,43.2,15.1,8.2,10.777778,-262.372222
5,20160606,52.2,59.7,42.8,16.9,9.4,11.222222,-261.927778
10,20160611,54.0,62.1,41.7,20.4,12.3,12.222222,-260.927778


In [103]:
# сортировка по колонке (по убыванию)
df.sort_values(by="TEMP", ascending=False).head()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS,TEMP_KELVIN
25,20160626,69.6,77.7,60.3,17.4,9.3,20.888889,-252.261111
2,20160603,68.4,,55.6,,12.8,20.222222,-252.927778
1,20160602,65.8,80.8,55.0,25.8,10.8,18.777778,-254.372222
28,20160629,65.8,73.2,,,,18.777778,-254.372222
29,20160630,65.7,72.7,59.2,13.5,6.5,18.722222,-254.427778


# Запись данных в файл:

In [105]:
# define output filename
output_fp = "Kumpula_temps_June_2016.csv"
df.to_csv(output_fp, sep=',')

In [106]:
# фрейм данных без индекса и только с 1 десятичной дробью в числах с плавающей запятой
output_fp = "Kumpula_temps_above15_June_2016.csv"
df.to_csv(output_fp, sep=',', index=False, float_format='%.1f')