In [89]:
import os
import pandas as pd
import re
import requests
import zipfile
import collections

try:
    from collections import OrderedDict
except ImportError:
    OrderedDict = dict

In [90]:
# Load the response spreadsheet into a DataFrame.
df = pd.read_excel('assignment1/response.xlsx')
# 'id' is the first 8 characters of the 'Thư điện tử' (e-mail) column.
# NOTE(review): presumably the e-mail local part starts with the 8-digit
# student number -- confirm against the sheet.
df['id'] = df['Thư điện tử'].str[:8]
# 'folder' is built per row as "<id>_<Họ và đệm> <Tên>"; it is later used as
# the per-student directory name under 'assignment1'.
df['folder'] = df.agg('{0[id]}_{0[Họ và đệm]} {0[Tên]}'.format, axis=1)
# Order rows by the 'Được hoàn thành' (completion) column, descending.
# NOTE(review): if that column is stored as text rather than datetimes the
# ordering is lexicographic, not chronological -- TODO confirm the dtype.
df = df.sort_values(by=['Được hoàn thành'], ascending=False)

In [91]:
def extract_id(response):
    """Extract a Google Drive file ID from a free-text response.

    Two link shapes are recognised, tried in this priority order:
    ``...open?id=<ID>`` and ``.../file/d/<ID>/...``. The ID is the first run
    of word characters or hyphens found after the marker.

    Args:
        response: The raw cell value. Anything that is not a string
            (e.g. NaN for an empty cell) is treated as "no link".

    Returns:
        The file ID as a string, or None when the response is not a string,
        contains neither marker, or has no ID characters after the marker.
    """
    if not isinstance(response, str):
        return None

    for marker in ('open?id=', '/file/d/'):
        pos = response.find(marker)
        if pos == -1:
            continue
        # Search only the text after the marker; the match can be absent
        # (e.g. the link is truncated right after 'open?id='), so guard
        # against None instead of calling .group() unconditionally.
        match = re.search(r"[\w-]+", response[pos + len(marker):])
        return match.group(0) if match else None

    return None

def download_file_from_google_drive(id, destination, replace=False):
    """Download a Google Drive file to ``destination``.

    Args:
        id: Google Drive file ID.
        destination: Path of the local file to write.
        replace: When False (default) and ``destination`` already exists as a
            file, nothing is downloaded and a message is printed instead.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the final response has an error status. This
            is checked before anything is written, so an error page is never
            saved as ``destination`` (which would otherwise be treated as a
            valid cached download on the next run).
    """
    if not replace and os.path.isfile(destination):
        print("Destination file", destination, "exists, download aborted")
        return
    URL = "https://docs.google.com/uc?export=download"

    # The context manager releases the session's pooled connections even when
    # a request or the file write fails.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            # A confirmation token was issued: drop the first (streamed,
            # unread) response and repeat the request with the token.
            response.close()
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        try:
            response.raise_for_status()
            save_response_content(response, destination)
        finally:
            # stream=True keeps the connection open until the body is
            # consumed or the response is closed explicitly.
            response.close()

def get_confirm_token(response):
    """Return the download-confirmation token carried in the response cookies.

    Scans ``response.cookies`` and returns the value of the first cookie whose
    name starts with ``'download_warning'``; returns None if there is none.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    The body is read through ``response.iter_content`` in 32768-byte chunks
    and written in binary mode; empty chunks are skipped.
    """
    chunk_bytes = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops the empty keep-alive chunks.
        for piece in filter(None, response.iter_content(chunk_bytes)):
            out_file.write(piece)
                       
def download_response(row):
    """Download, unpack and scan the data archive referenced by one response.

    ``row`` must provide ``'folder'`` (the per-student directory name) and
    ``'Response 1'`` (the free-text answer expected to contain a Drive link).
    When no Drive file ID can be extracted, a message is printed and nothing
    else happens. Otherwise the archive is saved as
    ``assignment1/<folder>/data.zip``, extracted into the sibling ``unzip``
    directory, and every text file found there is passed to the word counter.
    """
    folder, answer = row['folder'], row['Response 1']
    data_file_id = extract_id(answer)

    # Guard clause: nothing to download without a file ID.
    if data_file_id is None:
        print("Cannot find drive file ID, check response")
        return

    destination_dir = os.path.join('assignment1', folder)
    archive_path = os.path.join(destination_dir, 'data.zip')
    os.makedirs(destination_dir, exist_ok=True)

    print("Downloading", data_file_id, '...')
    download_file_from_google_drive(data_file_id, archive_path)

    print("-> Done. Start Unzipping")
    unzip(archive_path, os.path.join(destination_dir, 'unzip'))
    print("--> Unzip done.")

    print("---> Start iterate through all TEXT files")
    find_txt(os.path.join(destination_dir, 'unzip'))

def unzip(zip_path, folder_path):
    """Extract every member of the archive at ``zip_path`` into ``folder_path``.

    The target directory is created first if it does not exist yet.
    """
    os.makedirs(folder_path, exist_ok=True)
    with zipfile.ZipFile(zip_path) as archive:  # default mode is read-only
        archive.extractall(path=folder_path)


counts = {}  # Module-level tally shared by every word_count() call


def word_count(str):
    """Add the whitespace-separated words of ``str`` to the global ``counts``.

    Returns the (mutated) global ``counts`` dictionary itself, mapping each
    word to the number of times it has been seen across all calls so far.
    """
    for token in str.split():
        counts[token] = counts.get(token, 0) + 1

    return counts

def open_txt(txt_path):
    """Feed the transcript lines of one text file into the word counter.

    The first line is skipped (it contains the URL); every following line
    that does not contain ``".wav"`` is passed to ``word_count``.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        None. The effect is the update of the global word tally.
    """
    # NOTE(review): the file is opened with the platform default encoding, as
    # before; if the transcripts are UTF-8 this may need an explicit
    # encoding on some systems -- confirm against the data.
    with open(txt_path, "r") as f:  # closes the handle even on errors
        f.readline()  # Skip first line, which contains the URL
        for line in f:
            if ".wav" not in line:
                word_count(line)
    
def find_txt(unzip_path):
    """Walk ``unzip_path`` recursively and process every ``.txt`` file found.

    Files located under a path containing ``__MACOSX`` are ignored (macOS
    adds such directories to archives). Each remaining text file is handed
    to ``open_txt``.
    """
    for current_dir, _subdirs, filenames in os.walk(unzip_path):
        if "__MACOSX" in current_dir:
            # Nothing in a macOS metadata directory is ever processed.
            continue
        for filename in filenames:
            if filename.endswith(".txt"):
                open_txt(os.path.join(current_dir, filename))

        
def download_by_student_id(student_id):
    """Process every response row whose ``'id'`` equals ``student_id``.

    Iterates over the global ``df``. For each matching row it prints the
    folder name, the completion value and the raw response, runs
    ``download_response`` on the row, and prints a separator. All matching
    rows are processed, not just the first one.
    """
    for _index, row in df.iterrows():
        if row['id'] != student_id:
            continue
        print(row['folder'], row['Được hoàn thành'], row['Response 1'])
        download_response(row)
        print('--------------------------------------------------------')
        print("Dictionary: ")


In [92]:
# Student IDs to process in this run. Only the uncommented entries are part
# of the list; the commented-out IDs are skipped (the list of disabled IDs
# contains repeated entries). Uncomment an entry to include that student.
students = ['17021186',
'17021187',
'17021191',
'17020709',
# '17021182', This student drive is invalid!!!!!
# '17021059',
# '15021837',
# '17020970',
# '17021180',
# '17021201',
# '17020646',
# '15020971',
# '17021089',
# '16020046',
# '17021272',
# '17021351',
# '16020220',
# '17021305',
# '16020196',
# '16020237',
# '17021350',
# '17020045',
# '17021209',
# '16020216',
# '16020191',
# '17021352',
# '17021263',
# '16020190',
# '17021311',
# '17021194',
# '17021288',
# '17021184',
# '16020062',
# '17021200',
# '17021185',
# '17021203',
# '17021237',
# '16020199',
# '17021210',
# '17020173',
# '16020247',
# '17021227',
# '17021183',
# '16020272',
# '17021347',
# '17021225',
# '17020019',
# '17020103',
# '17021216',
# '17021339',
# '16021399',
# '16022148',
# '17021344',
# '16020236',
# '17021276',
# '17021231',
# '17021236',
# '17021346',
# '17021268',
# '17021295',
# '16022494',
# '17021230',
# '17020173',
# '16020047',
# '17020173',
# '17020042',
# '17020173',
# '17021205',
# '17021291',
# '17021195',
# '17021200',
# '17021263',
# '16022480',
# '17021019',
# '17021272',
# '16022145',
# '17021197',
# '16020203',
# '17020035',
# '17021350',
# '17021184',
# '17021259',
# '17021195',
# '17021285',
# '17020028',
# '17020039',
# '17021203',
# '17021185',
# '17021311',
# '17021191',
# '17021192',
# '16020047',
# '17021352',
# '17021238',
# '17020709',
# '16020219',
# '17021211',
# '16020216',
# '16020216',
# '17020042',
# '17021347',
# '17020053',
# '17021246',
# '17021183',
# '16020047',
# '16020062',
# '16020062',
# '17021346',
# '17021346',
# '16020203',
# '17021228',
# '17021213',
# '17021350',
# '17021191',
# '17021305',
# '17021311',
# '17020103',
# '16020236',
# '16021398',
# '17021194',
# '16021398',
# '17020709',
# '17021059',
# '17021197',
# '17021276',
# '17020709',
# '17020709',
# '17021184',
# '16022494',
# '16021398',
# '17021230',
# '17021182',
# '17020616',
# '17020709',
# '17021186',
# '17020019',
# '16020247',
# '17021195',
# '17020616',
# '17021182',
# '17021059',
# '17021182',
# '17021182',
# '17021187',
# '17021192',
# '17020709',
# '17021183',
# '17021187',
# '17021311',
# '17020020',
# '17020019',
# '17020035',
# '17020035',
# '17020035',
# '17021089',
# '17021089',
# '16021399',
# '16021399',
# '17020709',
# '17020646',
# '17020173',
# '16020272',
# '17021350',
# '16020236',
# '16020272',
# '17021246',
# '17021237',
# '17021089',
# '17021225',
# '17021200',
# '17021200',
# '15021837',
# '17021228',
# '17021305',
# '17021182',
# '17020045',
# '16020191',
# '17020020',
# '17021305',
# '16020199',
# '17021291',
# '16020196',
# '17021227',
# '17021209',
# '16020220',
# '16020196',
# '17021237',
# '16020272',
# '17021237',
# '17021230',
# '17021276',
# '16020047',
# '17021209',
# '17021213',
# '16020191',
# '17021205',
# '17021227',
# '16020235',
# '17021305',
# '17021345',
# '16020219',
# '16020219',
# '17021276',
# '17021276',
# '17021276',
# '17021276',
# '17021347',
# '17021230',
# '17021305',
# '17021230',
# '17021305',
# '16020220',
# '16020203',
# '17021230',
# '17021339',
# '17021339',
# '17021347',
# '17021347',
# '17021230',
# '17021291',
# '17021237',
# '16022480',
# '17021288',
# '17021231',
# '17021231',
# '17021187',
# '17021187',
# '17021187',
# '17021187',
# '17021187',
# '17020709',
# '17020042',
# '16022403',
# '16022403',
# '17021311',
# '17020709',
# '16020235',
# '17021259',
# '17021210',
# '17021213',
# '17020039',
# '17021238',
# '16020273',
# '16020273',
# '17021344',
# '17021280',
# '16021424',
# '17020036',
# '17021201',
# '16022145',
# '17021272',
# '17021244',
# '17020028',
# '17020616',
# '17021180',
# '17021192',
# '17021191',
# '17021186',
# '16022146',
# '17021203',
# '15020971',
# '17021246',
# '16020237',
# '17021183',
# '17020053',
# '16020062',
# '16020196',
# '17021209',
# '17021276',
# '16020191',
# '16020247',
# '17021195',
# '17020103',
# '16022403',
# '16020220',
# '17021231',
# '17021295',
# '17021309',
# '16022494',
# '16022148',
# '17021182',
# '17020019',
# '17021210',
# '16020272',
# '16020273',
'17021019']

# Process every selected student; each call adds to the global word tally.
for student in students:
    download_by_student_id(student)

# Rank the words by descending count and keep that order in the mapping.
ranked_pairs = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
result = OrderedDict(ranked_pairs)

print(result)

17021186_Nguyễn Thị Phương Đông 31 Tháng ba 2020  11:19 PM Link project:

https://github.com/phuongdongbn/Speech-and-Language-Processing

Link data: 

https://drive.google.com/file/d/1rSlCR1hgmaFxsu2hrQG3tlGJWRCzRFgQ/view?usp=sharing
Downloading 1rSlCR1hgmaFxsu2hrQG3tlGJWRCzRFgQ ...
Destination file assignment1/17021186_Nguyễn Thị Phương Đông/data.zip exists, download aborted
-> Done. Start Unzipping
--> Unzip done.
---> Start iterate through all TEXT files
--------------------------------------------------------
Dictionary: 
17021186_Nguyễn Thị Phương Đông 25 Tháng ba 2020  10:30 AM https://github.com/phuongdongbn/Speech-and-Language-Processing
Cannot find drive file ID, check response
--------------------------------------------------------
Dictionary: 
17021186_Nguyễn Thị Phương Đông - -
Cannot find drive file ID, check response
--------------------------------------------------------
Dictionary: 
17021187_Phạm Minh Đức 4 Tháng tư 2020  9:13 PM 1. https://github.com/duconline09/Audi

--> Unzip done.
---> Start iterate through all TEXT files
--------------------------------------------------------
Dictionary: 
17020709_Dương Thu Hà 1 Tháng tư 2020  12:35 AM Code: https://github.com/duongghaa/speechProcessing/tree/master/Tuan1_thudulieu

Data: https://drive.google.com/file/d/1VuMoKIlfNMttHSXH8bOyrWg9_cpFZdwo/view?usp=sharing
Downloading 1VuMoKIlfNMttHSXH8bOyrWg9_cpFZdwo ...
Destination file assignment1/17020709_Dương Thu Hà/data.zip exists, download aborted
-> Done. Start Unzipping
--> Unzip done.
---> Start iterate through all TEXT files
--------------------------------------------------------
Dictionary: 
17021019_Phạm Sơn Thành 21 Tháng tư 2020  8:37 PM https://github.com/phamsonthanh99/speech_processing
Cannot find drive file ID, check response
--------------------------------------------------------
Dictionary: 
17021019_Phạm Sơn Thành - -
Cannot find drive file ID, check response
--------------------------------------------------------
Dictionary: 
OrderedDict(