# Time series prepper!
* this tool converts wide data (e.g. semesters as column headers) to long data (semesters as a column) and moves time variable to column 0.

In [1]:
import pandas as pd

In [2]:
df_wide = pd.read_csv('meltC.CSV')
print(df_wide)

    COURSE  SP21  WI22  FA22  SP22  WI23  FA23  SP23  WI24  FA24  SP24  WI25  \
0   ENG101    23    40    26    28    41    25    22    31    41    20    41   
1   ENG102    42    21    36    36    22    31    22    29    35    44    22   
2   ENG201    25    23    26    25    44    21    21    24    22    19    32   
3   ENG202    41    42    30    22    39    34    28    21    20    18    27   
4   ENG212    23    32    17    43    20    36    28    25    27    24    19   
5   ENG215    25    16    30    42    29    18    19    27    41    39    32   
6   ENG301    26    18    27    22    36    17    33    44    36    34    19   
7   ENG301    25    19    27    17    35    35    22    27    44    29    17   
8   ENG324    22    37    41    39    40    43    41    31    24    29    17   
9   ENG340    37    44    32    18    44    39    20    26    32    38    19   
10  ENG344    17    42    19    31    31    29    38    34    44    29    26   
11  ENG400    29    39    28    17    26

In [3]:
df_long = pd.melt(df_wide,
                  id_vars=['COURSE'],
                  value_vars=['SP21',	'WI22',	'FA22',	'SP22',	'WI23',	'FA23',	'SP23',	'WI24',	'FA24',	'SP24',	'WI25',	'SP25'],
                  var_name='SEMESTER',
                  value_name='ENROLLMENT')
print(df_long)                 

     COURSE SEMESTER  ENROLLMENT
0    ENG101     SP21          23
1    ENG102     SP21          42
2    ENG201     SP21          25
3    ENG202     SP21          41
4    ENG212     SP21          23
..      ...      ...         ...
163  ENG340     SP25          18
164  ENG344     SP25          23
165  ENG400     SP25          29
166  ENG450     SP25          26
167  ENG490     SP25          36

[168 rows x 3 columns]


In [5]:
#Map to actual dates --- making sure our time variable has correct dtype!
#For reference: chrome-extension://efaidnbmnnnibpcajpcglclefindmkaj/https://brightspotcdn.byui.edu/d1/49/8fac3c0a40729543e1a3e033dfc8/term-begin-end-dates.pdf
# *Some dates used are approximate

semester_to_date_map = {
    'SP21': '2021-04-19',
    'WI22': '2022-01-05',
    'FA22': '2022-09-12',
    'SP22': '2021-04-18',
    'WI23': '2022-01-04',
    'FA23': '2022-09-16',
    'SP23': '2021-04-19',
    'WI24': '2022-01-05',
    'FA24': '2022-09-12',
    'SP24': '2021-04-19',
    'WI25': '2022-01-05',
    'FA25': '2022-09-12',
    'SP25': '2021-04-19',
    'WI26': '2022-01-05',
    'FA26': '2022-09-12',
    'SP26': '2021-04-19',
    'WI27': '2022-01-05',
    'FA27': '2022-09-12',
    'SP27': '2021-04-19',
    'WI28': '2022-01-05',
    'FA28': '2022-09-12',
    'SP28': '2021-04-19',
    'WI29': '2022-01-05',
    'FA29': '2022-09-12',
    'SP29': '2021-04-19',
    'WI30': '2022-01-05',
    'FA30': '2022-09-12',
    'SP30': '2021-04-19',
    'WI31': '2022-01-05',
    'FA31': '2022-09-12',
    'SP31': '2021-04-19',
    'WI32': '2022-01-05',
    'FA32': '2022-09-12',
}

df_long['DATE'] = df_long['SEMESTER'].map(semester_to_date_map)
df_long['DATE'] = pd.to_datetime(df_long['DATE'])
df_long = df_long.drop(columns=['SEMESTER']) # Remove 'DATE' from its current position


print("\nFinal dataframe ready for time series analysis!:")
print(df_long)
print(df_long.info()) #Make sure 'SEMESTER' is now a time variable!


Final dataframe ready for time series analysis!:
     COURSE  ENROLLMENT       DATE
0    ENG101          23 2021-04-19
1    ENG102          42 2021-04-19
2    ENG201          25 2021-04-19
3    ENG202          41 2021-04-19
4    ENG212          23 2021-04-19
..      ...         ...        ...
163  ENG340          18 2021-04-19
164  ENG344          23 2021-04-19
165  ENG400          29 2021-04-19
166  ENG450          26 2021-04-19
167  ENG490          36 2021-04-19

[168 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   COURSE      168 non-null    object        
 1   ENROLLMENT  168 non-null    int64         
 2   DATE        168 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.1+ KB
None


In [6]:
#Moving the time variable to column position 0

cols = df_long.columns.tolist()
cols.remove('DATE') #Pick it up
cols.insert(0, 'DATE') #Drop it back at 0
df = df_long[cols] #Call it df ... our final usable df for time series work

print("\nFinal dataframe ready for time series analysis!:")
print(df)


Final dataframe ready for time series analysis!:
          DATE  COURSE  ENROLLMENT
0   2021-04-19  ENG101          23
1   2021-04-19  ENG102          42
2   2021-04-19  ENG201          25
3   2021-04-19  ENG202          41
4   2021-04-19  ENG212          23
..         ...     ...         ...
163 2021-04-19  ENG340          18
164 2021-04-19  ENG344          23
165 2021-04-19  ENG400          29
166 2021-04-19  ENG450          26
167 2021-04-19  ENG490          36

[168 rows x 3 columns]
