In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import django
from django.db.models import Q

In [2]:
# Move from the notebook's folder up to the project root so that the Django
# project and the "notebooks/..." data paths used below resolve.
# The guard makes the cell safe to re-run: once the working directory already
# contains the `notebooks` folder we are at the root and must not climb again.
if not os.path.isdir("notebooks"):
    os.chdir("..")
# Point Django at the project's settings module (an existing value is kept).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'attendance.settings')
# Allow synchronous ORM calls from the notebook kernel, which Django would
# otherwise refuse because the kernel runs inside an event loop.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
django.setup()

In [3]:
from daily_attendance.models import DailyAttendance

In [4]:
# Load the daily attendance CSV, parsing the "Date" column as datetimes.
df_attendance = pd.read_csv(
    "notebooks/2018-2019_Daily_Attendance.csv",
    parse_dates=["Date"],
)

In [5]:
# Preview the first rows as loaded from the CSV.
df_attendance.head()

Unnamed: 0,School DBN,Date,Enrolled,Absent,Present,Released
0,01M015,2018-09-05,172,19,153,0
1,01M015,2018-09-06,171,17,154,0
2,01M015,2018-09-07,172,14,158,0
3,01M015,2018-09-12,173,7,166,0
4,01M015,2018-09-13,173,9,164,0


In [6]:
# Map the CSV headers to the lowercase names used for the database fields.
column_names = {
    "School DBN": "dbn",
    "Date": "date",
    "Enrolled": "enrolled",
    "Absent": "absent",
    "Present": "present",
    "Released": "released",
}
df_attendance = df_attendance.rename(columns=column_names)

In [7]:
# Check the first rows again, now with the renamed columns.
df_attendance.head()

Unnamed: 0,dbn,date,enrolled,absent,present,released
0,01M015,2018-09-05,172,19,153,0
1,01M015,2018-09-06,171,17,154,0
2,01M015,2018-09-07,172,14,158,0
3,01M015,2018-09-12,173,7,166,0
4,01M015,2018-09-13,173,9,164,0


## bulk_create

The following cell inserts all records into the database in batches of 100 (`batch_size=100`).

In [None]:
# Build one unsaved DailyAttendance per source row (each row's column names are
# passed as keyword arguments) and insert them in batches of 100.
attendance_rows = (
    DailyAttendance(**record)
    for record in df_attendance.to_dict(orient="records")
)
new_daily_attendance = DailyAttendance.objects.bulk_create(
    attendance_rows, batch_size=100
)

## Sync data

Once you have a dataframe with the data coming from any source, for example a csv file, the next step is to get what you have in the database.
You must define some conditions to query the data in the db and save it in a dataframe.

I am going to split the source dataframe because my database is hosted on lisa, which is a little slow. I also want to show how to filter the data in the queries.

In [8]:
# Split the source df into 10 roughly equal chunks of consecutive rows.
# np.array_split is applied to the row positions rather than to the DataFrame
# itself: splitting the frame directly relies on DataFrame.swapaxes, which
# pandas has deprecated and which makes this cell emit a warning.
row_chunks = np.array_split(np.arange(len(df_attendance)), 10)
df_attendance_split = [df_attendance.iloc[rows] for rows in row_chunks]

  return bound(*args, **kwds)


In [9]:
# Show the first chunk of the split source data.
df_attendance_split[0]

Unnamed: 0,dbn,date,enrolled,absent,present,released
0,01M015,2018-09-05,172,19,153,0
1,01M015,2018-09-06,171,17,154,0
2,01M015,2018-09-07,172,14,158,0
3,01M015,2018-09-12,173,7,166,0
4,01M015,2018-09-13,173,9,164,0
...,...,...,...,...,...,...
27711,03M199,2018-09-13,815,17,798,0
27712,03M199,2018-09-14,815,18,797,0
27713,03M199,2018-09-17,817,20,797,0
27714,03M199,2018-09-18,817,22,795,0


The next step is to get the data from the database. Notice that I need to filter the data: I split the source dataframe and I am using only the first chunk, `df_attendance_split[0]`, so I need to look only for those records. We can use Q objects.


## Q objects

A Q object (django.db.models.Q) is an object used to encapsulate a collection of keyword arguments.

For instance, complex queries to filter data using AND/OR statements could be done by using Q objects.

Add the following line in your imports section:

```from django.db.models import Q```

In [10]:
# dbn and date are my fields in the database, you must use the names you gave to those columns

# Build one OR-combined filter with a (dbn AND date) condition per row of the
# first chunk, so the query only returns the records that chunk refers to.
filter_list = Q()
# itertuples() avoids building a Series for every row (as iterrows() does) and
# the row index was never used; the dbn/date values read are the same.
for row in df_attendance_split[0].itertuples(index=False):
    filter_list.add(
        Q(dbn=row.dbn) & Q(date=row.date),
        Q.OR
    )

In [14]:
# Query the database with the filter built above and load the matching rows
# (one dict of field values per record) into a dataframe.
matching_rows = DailyAttendance.objects.filter(filter_list).values()
df_db = pd.DataFrame.from_records(matching_rows)

In [15]:
# Preview the rows returned by the database query.
df_db.head()

Unnamed: 0,id,dbn,date,enrolled,absent,present,released
0,554327,01M015,2018-09-14,173,11,162,0
1,554328,01M015,2018-09-17,173,10,163,0
2,554329,01M015,2018-09-18,174,7,167,0
3,554330,01M015,2018-09-20,174,7,167,0
4,554331,01M015,2018-09-21,174,8,166,0


In [16]:
# Inspect the column dtypes of the database frame; note how `date` is typed.
df_db.dtypes

id           int64
dbn         object
date        object
enrolled     int64
absent       int64
present      int64
released     int64
dtype: object

In [17]:
# The database returns `date` as plain Python objects, while the source frame
# parsed its dates as datetimes; convert so both frames share the same dtype
# on the key used for the merge below.
df_db["date"] = pd.to_datetime(df_db["date"])

In [18]:
# Re-check the dtypes: `date` should now be datetime64[ns].
df_db.dtypes

id                   int64
dbn                 object
date        datetime64[ns]
enrolled             int64
absent               int64
present              int64
released             int64
dtype: object

Now we have to merge the two dfs (source and database) to compare the data and find out whether there are new records to insert into the database or existing records to update.

Please work on Part 2 of this exercise https://github.com/novillo-cs/softdev_material/blob/main/classwork/unit_7/06_django_db/attendance.md

In [23]:
# Why do you think we use suffixes?

# Left-join the source rows onto the database rows by (dbn, date).
# Only df_attendance_split[0] was looked up in the database, so only that chunk
# is merged: joining the whole source would leave every row of the other chunks
# without a match and make them look like records missing from the database.
df_join = df_attendance_split[0].merge(
    df_db,
    how="left",
    on=["dbn", "date"],
    suffixes=("", "_db"),
)

In [24]:
# What are we doing here?

# "equal": all four counts in the source row match the database copy.
# Rows without a database match hold NaN in the *_db columns, and any
# comparison with NaN is False, so they are never flagged as equal.
df_join["equal"] = (
    (df_join["enrolled"] == df_join["enrolled_db"])
    & (df_join["absent"] == df_join["absent_db"])
    & (df_join["present"] == df_join["present_db"])
    & (df_join["released"] == df_join["released_db"])
)
# "exist": `id` comes only from the database side of the merge, so it is
# non-null exactly when the record was found there.
df_join["exist"] = df_join["id"].notna()

In [25]:
# Preview the merged frame together with the `equal` and `exist` flags.
df_join.head()

Unnamed: 0,dbn,date,enrolled,absent,present,released,id,enrolled_db,absent_db,present_db,released_db,equal,exist
0,01M015,2018-09-05,172,19,153,0,831475.0,172.0,19.0,153.0,0.0,True,True
1,01M015,2018-09-06,171,17,154,0,554323.0,171.0,17.0,154.0,0.0,True,True
2,01M015,2018-09-07,172,14,158,0,554324.0,172.0,14.0,158.0,0.0,True,True
3,01M015,2018-09-12,173,7,166,0,554325.0,173.0,7.0,166.0,0.0,True,True
4,01M015,2018-09-13,173,9,164,0,554326.0,173.0,9.0,164.0,0.0,True,True
