In [2]:
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [3]:
def get_data(URL, timeout=30):
    """Scrape one survey results page into a list of decision records.

    Parameters
    ----------
    URL : str
        Address of the results page to download (the notebook passes
        thegradcafe.com survey URLs).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of dict
        One ``{'date': ..., 'status': ...}`` entry, in page order, for every
        ``div.row.mb-2`` element that contains a green (accepted) or red
        (rejected) badge. ``date`` is the last two whitespace-separated words
        of the badge text joined with ``-`` (e.g. ``'9-Apr'``). Rows with
        neither badge are skipped.

    Raises
    ------
    requests.RequestException
        On connection errors, timeouts, or a 4xx/5xx HTTP response.
    """
    res = requests.get(URL, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page
    # into a misleading empty result.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # Checked in this order: a row with a green badge is reported as accepted
    # and the red badge is not looked at.
    badge_statuses = (
        ('badge badge-green', 'accepted'),
        ('badge badge-red', 'rejected'),
    )

    result_list = []
    for row in soup.find_all('div', attrs={'class': 'row mb-2'}):
        for badge_class, status in badge_statuses:
            badge = row.find('span', attrs={'class': badge_class})
            if badge is None:
                continue
            # split() with no argument drops newlines and collapses runs of
            # spaces, so the last two words are the day and the month.
            words = badge.text.split()
            result_list.append({
                'date': '-'.join(words[-2:]),
                'status': status
            })
            break

    return result_list

In [4]:
# Collect the Fall 2022 (season=F22) results, pages 1-4 at 40 entries per page.
all_result_list = []
for _page in tqdm(range(1, 5)):
    URL = f"https://www.thegradcafe.com/survey/?institution=University+of+California%2C+San+Diego&program=Computer+Science&degree=Masters&season=F22&page={_page}&per_page=40"
    try:
        result_list = get_data(URL)
    except Exception as exc:
        # Stay best-effort (one bad page must not abort the scrape), but
        # report the failure instead of hiding it. Catching Exception rather
        # than using a bare except lets Ctrl-C / kernel interrupt through.
        print(f"page {_page}: skipped ({exc!r})")
        continue
    all_result_list.extend(result_list)

  0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Name the columns explicitly so that an empty scrape still produces a frame
# with 'date' and 'status' columns rather than a column-less one that makes
# the following cells fail with KeyError.
df = pd.DataFrame(all_result_list, columns=['date', 'status'])

In [6]:
# Preview the scraped records; 'date' is still a day-month string such as
# '9-Apr' at this point.
df

Unnamed: 0,date,status
0,9-Apr,accepted
1,9-Apr,accepted
2,9-Apr,accepted
3,9-Apr,accepted
4,5-Apr,accepted
...,...,...
73,17-Feb,accepted
74,17-Feb,accepted
75,11-Feb,accepted
76,12-Feb,accepted


In [7]:
# Convert the date strings to datetimes.
# Dates are in the form '9-Apr' or '17-Feb' with no year; the year 2022 is
# appended before parsing.
# NOTE: this cell is not idempotent. Once the column is datetime, running it
# again fails on the string concatenation; re-create `df` first.
df['date'] = pd.to_datetime(df['date'] + '-2022', format='%d-%b-%Y')

In [8]:
# Keep only rows dated up to and including 30 April 2022.
df = df[df['date'] <= datetime(2022, 4, 30)]

In [9]:
# Keep an independent copy of the 2022 frame: the name `df` is reassigned to
# the current-year data further down the notebook.
df_22 = df.copy()

In [10]:
import plotly.graph_objects as go

In [11]:
# Bar chart of the 2022 decisions per day, one series per status.
fig = go.Figure()

for status, label in (('accepted', 'Accepted'), ('rejected', 'Rejected')):
    # Count once per status and reuse the result for both axes.
    daily_counts = df.loc[df['status'] == status, 'date'].value_counts()
    fig.add_trace(go.Bar(x=daily_counts.index,
                         y=daily_counts.values,
                         name=label))

# Vertical dotted line at today's day/month placed in 2022, the year of the
# data. "now minus 365 days" only lands there when the notebook runs in 2023
# and is a day off across a leap day. 2022 is not a leap year, so 29 Feb is
# clamped to 28 Feb.
today = datetime.now()
marker_day = 28 if (today.month, today.day) == (2, 29) else today.day
last_year_today = datetime(2022, today.month, marker_day)

# yref="paper" makes the line span the full plot height, so no maximum has to
# be computed from the data (which would raise on an empty frame).
fig.add_shape(type="line", xref="x", yref="paper",
              x0=last_year_today, x1=last_year_today, y0=0, y1=1,
              line=dict(color="LightSeaGreen", width=4, dash="dot"))

fig.update_layout(title="UC San Diego CS Masters decisions, 2022",
                  xaxis_title="Date",
                  yaxis_title="Number of reports")

fig.show()

## FOR CURRENT YEAR

In [12]:
# Same institution / program / degree query as before, but with no `season`
# or `page` parameter and 20 entries per page.
URL = "https://www.thegradcafe.com/survey/?per_page=20&q=&institution=University+of+California%2C+San+Diego&program=Computer+Science&degree=Masters"
result_list = get_data(URL)
# Copy into its own list so curr_list does not alias result_list.
curr_list = [*result_list]

In [13]:
# Current-year decisions. Explicit columns keep 'date' and 'status' present
# even when nothing was scraped.
df = pd.DataFrame(curr_list, columns=['date', 'status'])

# The scraped dates carry no year; 2023 is appended before parsing.
df['date'] = pd.to_datetime(df['date'] + '-2023', format='%d-%b-%Y')

# Remove entries dated after today's day/month (placed in 2023).
# The clock is read once so month and day always come from the same instant,
# and 29 Feb is clamped because 2023 is not a leap year (datetime(2023, 2, 29)
# would raise ValueError).
today = datetime.now()
cutoff_day = 28 if (today.month, today.day) == (2, 29) else today.day
df = df[df['date'] <= datetime(2023, today.month, cutoff_day)]
df

Unnamed: 0,date,status
0,2023-03-02,accepted
1,2023-03-02,accepted
2,2023-03-01,accepted
3,2023-02-18,accepted
4,2023-03-01,accepted
5,2023-02-16,accepted
6,2023-02-24,accepted
7,2023-02-23,accepted
8,2023-02-15,accepted
9,2023-02-17,accepted


In [14]:
# Bar chart of the current-year (2023) decisions per day, one series per status.
fig_curr = go.Figure()

for status, label in (('accepted', 'Accepted'), ('rejected', 'Rejected')):
    # Count once per status and reuse the result for both axes.
    daily_counts = df.loc[df['status'] == status, 'date'].value_counts()
    fig_curr.add_trace(go.Bar(x=daily_counts.index,
                              y=daily_counts.values,
                              name=label))

# Vertical dotted line at today's day/month, anchored to 2023 so it stays on
# the same axis as the 2023-stamped data whenever the notebook is run.
# 2023 is not a leap year, so 29 Feb is clamped to 28 Feb.
now = datetime.now()
marker_day = 28 if (now.month, now.day) == (2, 29) else now.day
today_marker = datetime(2023, now.month, marker_day)

# yref="paper" makes the line span the full plot height, so no maximum has to
# be computed from the data (which would raise on an empty frame).
fig_curr.add_shape(type="line", xref="x", yref="paper",
                   x0=today_marker, x1=today_marker, y0=0, y1=1,
                   line=dict(color="LightSeaGreen", width=4, dash="dot"))

fig_curr.update_layout(title="UC San Diego CS Masters decisions, 2023",
                       xaxis_title="Date",
                       yaxis_title="Number of reports")

In [15]:
# Render the 2022 chart and the current-year chart one after the other so the
# two can be compared.
fig.show()
fig_curr.show()

In [16]:
# Accepted vs. rejected totals in the 2022 snapshot.
df_22['status'].value_counts()

rejected    43
accepted    34
Name: status, dtype: int64

In [17]:
# Same tally for the current-year frame (`df` now holds the 2023 data).
df['status'].value_counts()

accepted    14
Name: status, dtype: int64