## import libraries

In [1]:
import os #to import all files in directory
import docx #to read docx files
import pandas as pd
import numpy as np
import time #to have today's date
#today's date formatted as YYYYMMDD (used below as part of the exported file name)
timestr = time.strftime("%Y%m%d")

## import interviews

In [2]:
def delete_paragraph(paragraph):
    """Remove *paragraph* from its parent and invalidate the wrapper object.

    Used to get rid of paragraphs that are just empty (e.g. ``''``).

    paragraph: object exposing the underlying element as ``_element``;
        that element must provide ``getparent()``.
    Returns nothing. Afterwards ``paragraph._p`` and ``paragraph._element``
    are both ``None``.
    """
    element = paragraph._element
    parent = element.getparent()
    parent.remove(element)
    #the wrapper must no longer point at the element that was just removed
    paragraph._p = None
    paragraph._element = None

In [3]:
#get all interviews from all folders!
#code from: https://pythonguides.com/python-get-all-files-in-directory/
def import_transcripts(path=None):
    """Load every .docx transcript found below *path* (sub-folders included).

    path: folder to search. Defaults to the 'transcripts' folder inside the
        current working directory. The default is built with os.path.join so
        it also works where the path separator is not a backslash.

    Returns a dict that maps the file name without its extension to the
    loaded docx document. If two files in different folders share the same
    name, the one found later replaces the earlier one. A folder that does
    not exist or holds no transcripts yields an empty dict.

    Files that are not .docx, and the '~$...' lock files Word creates while a
    document is open, are skipped instead of being handed to docx.Document.
    Paragraphs that contain nothing but whitespace are deleted from every
    loaded document.
    """
    if path is None:
        path = os.path.join('.', 'transcripts')
    dict_all_interviews_w_q = {}
    for root, dirs, files in os.walk(path):
        for file in files:
            short_name, extension = os.path.splitext(file)
            if extension.lower() != '.docx' or file.startswith('~$'):
                continue #not a transcript
            #each interview as a docx file, named as in the folder
            dict_all_interviews_w_q[short_name] = docx.Document(os.path.join(root, file))
    #sometimes a paragraph is empty, delete it!
    for document in dict_all_interviews_w_q.values():
        #.paragraphs is read once per document, so deleting while looping is safe
        for paragraph in document.paragraphs:
            if not paragraph.text.strip():
                delete_paragraph(paragraph)
    return dict_all_interviews_w_q

In [4]:
#get dict with the questions (from Colombia onwards)
#keys are the file names without '.docx', values are the loaded documents
dict_all_interviews_w_q = import_transcripts()

## prepare dataframe with all documents

In [5]:
#prepare dataframe with all the questions ('Question') in one column and the replies ('Text') in another
#The rows are collected in a list and the dataframe is built once at the end:
#DataFrame.append no longer exists in current pandas, and growing a dataframe
#row by row copies it on every step.
var_rows = []
var_seen_texts = set() #every value that has been stored in column 'Text' so far
for x in dict_all_interviews_w_q:
    #read the paragraph texts once per interview
    var_paragraph_texts = [p.text for p in dict_all_interviews_w_q[x].paragraphs]
    for y, var_text_1 in enumerate(var_paragraph_texts):
        if var_text_1 in var_seen_texts: #already stored as a reply (e.g. together with its question)
            continue
        if '<q/start>' in var_text_1: #if it is a question
            if y + 1 >= len(var_paragraph_texts):
                #last paragraph is a question. Change in word
                raise IndexError("last paragraph of interview '" + str(x)
                                 + "' is a question without a reply: " + var_text_1)
            var_text_2 = var_paragraph_texts[y+1]
            var_rows.append({'Question':var_text_1, 'Text':var_text_2})
            var_seen_texts.add(var_text_2)
        else: #otherwise (so it is a reply)
            var_rows.append({'Text':var_text_1}) #'Question' stays empty (NaN) for this row
            var_seen_texts.add(var_text_1)
#a list (not a set) of column names keeps the column order fixed
df_questions_replies = pd.DataFrame(var_rows, columns=['Question', 'Text'], dtype=object)

In [6]:
#find the respective interview for text
def return_interview(some_text, interviews=None):
    """Find the interview that contains a paragraph whose text equals *some_text*.

    some_text: the exact paragraph text to look for.
    interviews: dict mapping interview names to documents (objects with a
        ``paragraphs`` list whose items have a ``text`` attribute).
        Defaults to the global ``dict_all_interviews_w_q``.

    Returns a tuple ``(interview_name, text)`` for the first match. If no
    paragraph matches, ``(None, some_text)`` is returned, so callers that
    unpack the result into two names keep working.
    """
    if interviews is None:
        interviews = dict_all_interviews_w_q
    for var_interview, document in interviews.items():
        for z in document.paragraphs:
            if z.text == some_text:
                return (var_interview, z.text)
    return (None, some_text)

In [7]:
#check if something went wrong! Print the respective transcripts and the text
def _interview_name(some_text):
    """Return the name of the interview that contains some_text, as a string ('None' if not found)."""
    found = return_interview(some_text)
    return str(found[0]) if found else str(None)

for x in df_questions_replies.index:
    var_q = df_questions_replies.loc[x, 'Question']
    var_text = df_questions_replies.loc[x, 'Text']
    #var_q and var_text are never overwritten below, so every check looks at the values of this row

    if not pd.isnull(var_q):
        #is only either <q/start> or <q/end> in Question?
        var_has_start = "<q/start>" in var_q
        var_has_end = "<q/end>" in var_q
        if var_has_start and not var_has_end:
            print("only <q/start> in Question: " + _interview_name(var_q)+': '+var_q+'\n')
        if var_has_end and not var_has_start:
            print("only <q/end> in Question: " + _interview_name(var_q)+': '+var_q+'\n')
    else:
        #if we have two missing questions in a row
        #(the last row has no following row, so it is skipped here)
        var_next = x+1
        if var_next in df_questions_replies.index and pd.isnull(df_questions_replies.loc[var_next, 'Question']):
            print(">2 consecutive Questions are empty: " + _interview_name(var_text)+': '+var_text+'\n')

    #is <q/start> or <q/end> in column Text?
    if any(var_tag in var_text for var_tag in ["<q/start>", "<q/end>"]):
        print("<q/end> or <q/start> in var_text: " + _interview_name(var_text)+': '+var_text+'\n')
    

In [8]:
#remove the character for new paragraphs in questions
#entries that are not strings (missing questions) are left as they are
df_questions_replies['Question'] = df_questions_replies['Question'].map(
    lambda var_question: var_question.replace('\n', '') if isinstance(var_question, str) else var_question)

In [9]:
#make sure the target folder exists, otherwise to_csv fails
os.makedirs('.//results//topics_group', exist_ok=True)
#'utf-8-sig' puts a byte-order mark at the start of the file, so programs that
#would otherwise assume a legacy code page (e.g. Excel) read it as UTF-8.
#Without it, '…' is turned into 'â€¦' when the exported file is opened there.
df_questions_replies.to_csv('.//results//topics_group//questions_replies_'+timestr+'_V01.csv',
    sep=';', decimal=',', encoding='utf-8-sig')

In [10]:
#build a dict in which every question is combined with its answer:
#the paragraph that holds only the answer is dropped afterwards.
#P: .copy() is shallow, both dicts share the same document objects,
#so paragraphs are dropped in all dicts!
dict_all_interviews_qa = dict_all_interviews_w_q.copy()
for var_name, var_document in dict_all_interviews_qa.items():
    var_paragraphs = var_document.paragraphs
    var_pos = 0
    while var_pos < len(var_paragraphs):
        var_question_text = var_paragraphs[var_pos].text
        if '<q/start>' in var_question_text:
            var_reply = var_paragraphs[var_pos + 1]
            var_reply_text = var_reply.text
            if '<q/start>' not in var_reply_text: #the next paragraph is a reply, not another question
                var_merged = var_question_text + ' ' + var_reply_text
                #put a space (' ') after start and before end. So I can drop it in stopwords
                var_merged = var_merged.replace('<q/start>', '<q/start> ')
                var_merged = var_merged.replace('<q/end>', ' <q/end>')
                #replace RE with renewable energy & REs with renewable energies
                var_merged = var_merged.replace(" RE ", " renewable energy ")
                var_merged = var_merged.replace(' REs ', ' renewable energies ')
                var_merged = var_merged.replace(" RE's ", " renewable energies ")
                var_paragraphs[var_pos].text = var_merged
                delete_paragraph(var_reply)
                #the document is one paragraph shorter now: read the list again
                var_paragraphs = var_document.paragraphs
        var_pos += 1

In [11]:
#get dict with the questions (from Colombia onwards)
#second call to import_transcripts(): the transcripts are read again into a separate dict
dict_all_interviews_q_a = import_transcripts()