# RELABEL EXISTING QUESTION TOPIC WITH A MORE APPROPRIATE TOPIC

## Import Libraries

In [1]:
import pandas as pd
from collections import Counter

## Import Data

In [2]:
# Load the dataset to relabel (path is relative to the current working directory).
# Later cells read its 'question_content', 'question_topic' and 'question_language' columns.
question_topic_valid = pd.read_csv('../data/question_topic_valid.csv')

## Relabel Question Topic Version 1

There are some topics in 'question_topic' where the topic is so general that it could apply to multiple other topics. In other words, it is a generic category. For example, there is the category 'bean' and there are also the categories 'castor-bean', 'french-bean' and 'mung-bean'. Additionally, there are examples where a generic term like 'plant' was used even though, based on the 'question_content', a much more specific topic such as 'tomato' could have been used.

The code updates the question topic, where required, by doing the following:
- Checks if the generic topic appears in the question content and if not looks for a better category from the words in the content
- For generic topics that share their name with other more specific topics, checks if any specific names appear in the content and updates to the more appropriate question topic

### Assumptions

1. In many cases a question topic was labelled with a generic topic when a more specific topic could have been used.
2. The original intention for the question topic of 'plant' refers to the act of planting rather than a general classification for all types of plants.
3. If the currently assigned 'question_topic' appears as a word in 'question_content' then the assignment is correct. The exception to this is where the 'question_topic' is more general and forms part of the name of a more descriptive 'question_topic' where its full name also appears in the 'question_content'. For example, if the question_topic is 'bean' but the question_content includes the words 'castor' & 'bean' then the 'question_topic' should be changed to 'castor-bean'.
4. While 'mushroom' is technically not a plant it has been included with other plant groups.
5. While 'locust' is technically an animal it has been included with the other plant groups as it is a pest that affects certain crops.
6. 'bee' refers to bee farms rather than pollinating plants.
7. If the original 'question_topic' refers to any general animal group then it can only be replaced by another more specific animal 'question_topic' type.
8. If the original 'question_topic' refers to any plant group then it can only be replaced by another more specific plant 'question_topic' type.
9. Questions that are not in English will not always contain the words which match with the 'question_topic' so will be skipped for relabelling.

### Create list for each question topic adding plurals and/or other derivatives.

NB:
- corriander should be coriander so including both spellings in list
- cyprus should be cypress so including both spellings in list
- lucern should be lucerne so including both spellings in list
- macademia should be macadamia so including both spellings in list
- some plurals added are technically not proper plurals but are commonly used in the question content

In [3]:
# Display the distinct labels currently present in the 'question_topic' column
question_topic_valid.question_topic.unique()

array(['cattle', 'cat', 'poultry', 'rabbit', 'pig', 'coconut', 'plant',
       'tomato', 'animal', 'potato', 'watermelon', 'coffee', 'onion',
       'chicken', 'rice', 'maize', 'napier-grass', 'tree', 'sheep',
       'passion-fruit', 'cassava', 'pigeon', 'banana', 'kale', 'bean',
       'wheat', 'cereal', 'carrot', 'cabbage', 'crop', 'spinach',
       'turkey', 'peach', 'goat', 'paw-paw', 'butternut-squash', 'acacia',
       'livestock', 'sweet-potato', 'greens', 'pumpkin', 'millet', 'bird',
       'plantain', 'olive', 'vegetable', 'tobacco', 'sugar-cane', 'bee',
       'avocado', 'pineapple', 'beetroot', 'melon', 'dog', 'guava',
       'capsicum', 'miraa', 'grass', 'mango', 'macademia', 'sesame',
       'pear', 'jackfruit', 'cowpea', 'nightshade', 'cotton',
       'guinea-fowl', 'flax', 'apple', 'fish', 'tea', 'cocoa', 'garlic',
       'duck', 'sunflower', 'orange', 'peanut', 'soya', 'squash',
       'tilapia', 'safflower', 'mushroom', 'collard-greens',
       'french-bean', 'mung-bea

### Create Dictionaries to hold Alternate Words for Topics in 'question_topic'

In [4]:
# Split animal topics between generic topics and non-generic topics.
# Each key is a 'question_topic' label and each value is the set of lower-case
# words that count as a mention of that topic. The words are compared with the
# whitespace-split 'question_content', so an alias must never contain spaces.
animal_generic_topics = {'animal': {'animal', 'animals'}, 'bird': {'bird', 'birds'}, 'fish': {'fish', 'fishes'},
                         'poultry': {'poultry', 'poultries'}
                        }
animal_non_generic = {'bee': {'bee', 'bees'}, 'camel': {'camel', 'camels'}, 'cat': {'cat', 'cats'},
                      'cattle': {'cattle', 'cow', 'cows'},
                      'chicken': {'chicken', 'chickens'}, 'dog': {'dog', 'dogs'}, 'duck': {'duck', 'ducks'},
                      'goat': {'goat', 'goats'},
                      'guinea-fowl': {'guinea-fowl', 'guineafowl', 'guineafowls', 'fowl', 'fowls'},
                      'guinea-pig': {'guinea-pig', 'guineapig', 'guineapigs'}, 'livestock': {'livestock', 'livestocks'},
                      'ostrich': {'ostrich', 'ostriches'}, 'pig': {'pig', 'pigs'}, 'pigeon': {'pigeon', 'pigeons'},
                      'rabbit': {'rabbit', 'rabbits'}, 'sheep': {'sheep', 'sheeps'},
                      # 'tilapias' previously carried a trailing space and could never match a token
                      'tilapia': {'tilapia', 'tilapias'},
                      'turkey': {'turkey', 'turkeys'}
                     }
                          

# Split plant topics between generic topics and non-generic topics.
# Each key is a generic plant 'question_topic'; each value is the set of
# lower-case words that count as a mention of that topic in 'question_content'.
plant_generic_topics = {'bean': {'bean', 'beans'}, 'crop': {'crop', 'crops'}, 'grass': {'grass', 'grasses'},
                        'greens': {'greens'},
                        'millet': {'millet', 'millets'}, 'nightshade': {'nightshade', 'nightshades'},
                        'pea': {'pea', 'peas'},
                        'plant': {'plant', 'plants', 'planting'}, 'potato': {'potato', 'potatoes', 'potatos'},
                        'squash': {'squash', 'squashes'},
                        'tree': {'tree', 'trees'},
                        # the plural was previously missing ('vegetable' was listed twice)
                        'vegetable': {'vegetable', 'vegetables'},
                        'vetch': {'vetch', 'vetches'}
                       }
# Specific (non-generic) plant topics.
# Each key is a 'question_topic' label; each value is the set of lower-case words
# (singular, plural, joined and partial forms) that count as a mention of it.
# Aliases are compared with whitespace-split tokens, so they must not contain
# spaces or stray punctuation.
# Fixes relative to the earlier version of this dictionary:
#   - 'eucalyptus' and 'eucalyptuses' were fused into one string by a missing comma
#   - 'cereals ', 'sesames ' (trailing space) and 'pawpaws,' (embedded comma) could not match cleanly
#   - duplicated singulars replaced by the intended plurals (caliandra, cassava, cucumber,
#     corriander, strawberry)
#   - 'blacknighshade' spelling corrected; 'sweetpotato' and 'sudangrasses' added
plant_non_generic = {'acacia': {'acacia','acacias'},
                     'african-nightshade': {'african-nightshade', 'africannightshade','africannightshades','african', 'africans'},
                     'amaranth': {'amaranth','amaranths'}, 'apple': {'apple','apples'}, 'apricot': {'apricot','apricots'},
                     'asparagus': {'asparagus'}, 'aubergine': {'aubergine','aubergines'}, 'avocado': {'avocado','avocados'},
                     'bamboo': {'bamboo','bamboos'}, 'banana': {'banana','bananas'}, 'barley': {'barley','barleys'},
                     'beetroot': {'beetroot','beetroots'}, 'blackberry': {'blackberry','blackberrys','blackberries'},
                     'black-nightshade': {'black-nightshade','blacknightshade','blacknightshades'},
                     'boma-rhodes': {'boma-rhodes','bomarhode','bomarhodes','boma','bomas','rhode','rhodes'},
                     'brachiaria-grass': {'brachiaria-grass','brachiariagrass','brachiariagrasses','brachiaria','brachiarias'},
                     'broccoli': {'broccoli','broccolis'}, 'butternut-squash': {'butternut-squash','butternutsquash','butternut','butternuts'},
                     'cabbage': {'cabbage','cabbages'}, 'caliandra': {'caliandra','caliandras'}, 'capsicum': {'capsicum','capsicums'},
                     'carrot': {'carrot','carrots'}, 'cashew-nut': {'cashew-nut','cashewnut','cashewnuts','cashew','cashews'},
                     'cassava': {'cassava','cassavas'}, 'castor-bean': {'castor-bean','castorbean','castorbeans','castor','castors'},
                     'cauliflower': {'cauliflower','cauliflowers'}, 'celery': {'celery','celeries','celerys'}, 'cereal': {'cereal','cereals'},
                     'chard': {'chard','chards'}, 'chia': {'chia','chias'}, 'chickpea': {'chickpea','chickpeas'},
                     'chilli': {'chilli','chillies','chillis'}, 'clover': {'clover','clovers'}, 'cocoa': {'cocoa','cocoas'},
                     'coconut': {'coconut','coconuts'}, 'coffee': {'coffee','coffees'},
                     'collard-greens': {'collard-greens','collardgreens','collardgreen','collard','collards'},
                     'corriander': {'corriander','corrianders','coriander','corianders'}, 'cotton': {'cotton','cottons'},
                     'courgette': {'courgette','courgettes'}, 'cowpea': {'cowpea','cowpeas'}, 'cranberry': {'cranberry','cranberries','cranberrys'},
                     'cucumber': {'cucumber','cucumbers'}, 'cyprus': {'cyprus','cypress','cypresses','cyprusses'},
                     'desmodium': {'desmodium','desmodiums'}, 'eucalyptus': {'eucalyptus','eucalyptuses','eucalypti'},
                     'finger-millet': {'finger-millet','fingermillet','fingermillets'}, 'flax': {'flax','flaxes','flaxs'},
                     'french-bean': {'french-bean','frenchbean','frenchbeans','french'}, 'garlic': {'garlic','garlics'},
                     'ginger': {'ginger','gingers'}, 'gooseberry': {'gooseberry','gooseberries','gooseberrys'}, 'grape': {'grape','grapes'},
                     'guava': {'guava','guavas'}, 'jackfruit': {'jackfruit','jackfruits','jack'}, 'kale': {'kale','kales'}, 'leek': {'leek','leeks'},
                     'lemon': {'lemon','lemons'}, 'lettuce': {'lettuce','lettuces'}, 'leucaena': {'leucaena','leucaenas'},
                     'locust': {'locust','locusts'}, 'lucern': {'lucern','lucerne','lucernes','lucerns'}, 'lupin': {'lupin','lupins'},
                     'macademia': {'macademia','macadamia','macadamias','macademias'}, 'maize': {'maize'}, 'mango': {'mango','mangoes','mangos'},
                     'melon': {'melon','melons'}, 'miraa': {'miraa','miraas','khat','khats'}, 'mulberry': {'mulberry','mulberries','mulberrys'},
                     'mung-bean': {'mung-bean','mungbean','mungbeans','mung'}, 'mushroom': {'mushroom','mushrooms'},
                     'napier-grass': {'napier-grass','napiergrass','napiergrasses','napier','napiers'}, 'oat': {'oat','oats'},
                     'okra': {'okra','okras'}, 'olive': {'olive','olives'}, 'onion': {'onion','onions'}, 'orange': {'orange','oranges'},
                     'parsley': {'parsley','parsleys'}, 'passion-fruit': {'passion-fruit','passionfruit','passionfruits'},
                     'paw-paw': {'paw-paw','pawpaw','pawpaws','paw','paws'}, 'peach': {'peach','peaches'}, 'peanut': {'peanut','peanuts'},
                     'pear': {'pear','pears'}, 'pigeon-pea': {'pigeon-pea','pigeonpea','pigeonpeas'}, 'pineapple': {'pineapple','pineapples'},
                     'plantain': {'plantain','plantains'}, 'pumpkin': {'pumpkin','pumpkins'},
                     'purple-vetch': {'purple-vetch','purplevetch','purplevetches'}, 'pyrethrum': {'pyrethrum','pyrethrums'},
                     'radish': {'radish','radishes'}, 'rapeseed': {'rapeseed','rapeseeds'}, 'rice': {'rice','rices'}, 'rye': {'rye','ryes'},
                     'safflower': {'safflower','safflowers'}, 'sesame': {'sesame','sesames'},
                     'setaria': {'setaria','setariagrass','setarias','setariasgrass'}, 'sisal': {'sisal','sisals'},
                     'snap-pea': {'snap-pea','snappea','snappeas','snap'}, 'snow-pea': {'snow-pea','snowpea','snowpeas','snow'},
                     'soya': {'soya','soyas','soyabean','soyabeans'}, 'spinach': {'spinach','spinaches'},
                     'strawberry': {'strawberry','strawberries'},
                     'sudan-grass': {'sudan-grass','sudangrass','sudangrasses','sudan-grasses','sudan'},
                     'sugar-cane': {'sugar-cane','sugarcane','sugarcanes','sugar','cane'}, 'sunflower': {'sunflower','sunflowers'},
                     'sweet-potato': {'sweet-potato','sweetpotato','sweetpotatoes','sweetpotatos'}, 'taro': {'taro','taros'}, 'tea': {'tea','teas'},
                     'tobacco': {'tobacco','tobaccos'}, 'tomato': {'tomato','tomatoes','tomatos'}, 'watermelon': {'watermelon','watermelons'},
                     'wheat': {'wheat','wheats'}, 'yam': {'yam','yams'}
                    }


# Single lookup of every generic topic (animal entries first, then plant entries)
all_generic_topics = dict(animal_generic_topics)
all_generic_topics.update(plant_generic_topics)

### Make Dictionaries for Replacing Generic Question Topics With More Specific Topics

In most cases the more specific names are hyphenated in 'question_topic' where they often appear as 2 separate words in 'question_content'

The lists for the more specific question topics, that are hyphenated, have been extended to cater for the 2 words appearing separately or joined together. 

In [5]:
# Generic topics and the words that count as a mention of them;
# used to check whether 'question_content' includes any of these generic topics
generic_topics_shared = dict(
    bean={'bean', 'beans'},
    grass={'grass', 'grasses'},
    greens={'greens'},
    millet={'millet', 'millets'},
    nightshade={'nightshade', 'nightshades'},
    pea={'pea', 'peas'},
    potato={'potato', 'potatoes', 'potatos'},
    squash={'squash', 'squashes'},
    vetch={'vetch', 'vetches'},
)

# Alternatives to the generic value if any of the values exist in the 'question_content'.
# One dictionary per generic topic: keys are the more specific 'question_topic' labels,
# values are the lower-case words that count as a mention of that specific topic.
all_bean_types = {'castor-bean': {'castor-bean','castorbean','castorbeans','castor','castors'},
                  'french-bean': {'french-bean','frenchbean','frenchbeans','french'},
                  'mung-bean': {'mung-bean','mungbean','mungbeans','mung'}
                 }
# NOTE(review): 'sisal' is grouped with the grasses here - confirm this is intended
all_grass_types = {'boma-rhodes': {'boma-rhodes','bomarhode','bomarhodes','boma','bomas','rhode','rhodes'},
                   'brachiaria-grass': {'brachiaria-grass','brachiariagrass','brachiariagrasses','brachiaria','brachiarias'},
                   'napier-grass': {'napier-grass','napiergrass','napiergrasses','napier','napiers'},
                   'setaria': {'setaria','setariagrass','setarias','setariasgrass'}, 'sisal': {'sisal','sisals'},
                   # joined plural 'sudangrasses' added alongside the hyphenated form
                   'sudan-grass': {'sudan-grass','sudangrass','sudangrasses','sudan-grasses','sudan'}
                  }
all_greens_types = {'collard-greens': {'collard-greens','collardgreens','collardgreen','collard','collards'}}
all_millet_types = {'finger-millet': {'finger-millet','fingermillet','fingermillets'}}
all_nightshade_types = {'african-nightshade': {'african-nightshade', 'africannightshade','africannightshades','african', 'africans'},
                        # spelling corrected (was 'blacknighshade', which real text is unlikely to contain)
                        'black-nightshade': {'black-nightshade','blacknightshade','blacknightshades'}
                       }
all_pea_types = {'chickpea': {'chickpea','chickpeas'}, 'cowpea': {'cowpea','cowpeas'}, 'pigeon-pea': {'pigeon-pea','pigeonpea','pigeonpeas'},
                 'snap-pea': {'snap-pea','snappea','snappeas','snap'}, 'snow-pea': {'snow-pea','snowpea','snowpeas','snow'}
                }
# joined singular 'sweetpotato' added; only the plurals were listed before
all_potato_types = {'sweet-potato': {'sweet-potato','sweetpotato','sweetpotatoes','sweetpotatos'}}
all_squash_types = {'butternut-squash': {'butternut-squash','butternutsquash','butternut','butternuts'}}
all_vetch_types = {'purple-vetch': {'purple-vetch','purplevetch','purplevetches'}}

In [6]:
def update_question_topic(question_topic, question_content_list, topic_dicts):
    """
    Return the topic from ``topic_dicts`` that the question content mentions most.

    ``question_content_list`` is the question split into words and
    ``topic_dicts`` maps each candidate topic to the set of words that count
    as a mention of it. Hyphenated topics are found when the words listed for
    them appear in the content, including topics written as two words.

    A topic's score is the total number of occurrences of any of its words.
    The highest-scoring topic is returned; on a tie the topic that comes
    first in ``topic_dicts`` is returned.

    If no candidate is mentioned at all, the originally assigned
    ``question_topic`` is returned unchanged.
    """
    tally = Counter(question_content_list)
    mentions_by_topic = {}
    for topic, aliases in topic_dicts.items():
        # A Counter yields 0 for absent words, so every alias can be summed directly
        mentions = sum(tally[alias] for alias in aliases)
        if mentions > 0:
            mentions_by_topic[topic] = mentions
    if not mentions_by_topic:
        return question_topic
    # max() keeps the first maximal key, i.e. the earliest topic in topic_dicts
    return max(mentions_by_topic, key=mentions_by_topic.get)

In [None]:
# Check each record for generic topics assigned to 'question_topic' and update if a better topic exists

# Every word of every specific topic; built once because it does not change between rows
animal_specific_words = set().union(*animal_non_generic.values())
plant_specific_words = set().union(*plant_non_generic.values())

# Generic topic -> the more specific topics that share its name.
# An explicit mapping replaces the former eval('all_<topic>_types') lookup and
# makes it clear which generic topics have shared-name alternatives at all.
specific_types_by_generic = {
    'bean': all_bean_types, 'grass': all_grass_types, 'greens': all_greens_types,
    'millet': all_millet_types, 'nightshade': all_nightshade_types, 'pea': all_pea_types,
    'potato': all_potato_types, 'squash': all_squash_types, 'vetch': all_vetch_types,
}

for i, v in question_topic_valid.iterrows():
    question_topic = v.question_topic  # Extract currently assigned 'question_topic'

    # Only update English records with a generic name for 'question_topic'
    if question_topic not in all_generic_topics or v.question_language != 'eng':
        continue

    content_list = v.question_content.lower().split()  # 'question_content' as a list of lower-case words
    content_words = set(content_list)
    new_topic = None

    # If the topic does not exist in the question content then look for a topic that does
    if not all_generic_topics[question_topic] & content_words:
        # Only replace animal related topics with other animal related topics
        if question_topic in animal_generic_topics and animal_specific_words & content_words:
            new_topic = update_question_topic(question_topic, content_list, animal_non_generic)
        # Only replace plant related topics with other plant related topics
        elif question_topic in plant_generic_topics and plant_specific_words & content_words:
            new_topic = update_question_topic(question_topic, content_list, plant_non_generic)
    # Topic exists in the question content: only update generic topics whose name forms part
    # of a more specific topic. 'crop', 'plant', 'tree' and 'vegetable' have no such
    # alternatives and previously raised a KeyError here; they are now left unchanged.
    elif question_topic in specific_types_by_generic:
        specific_types = specific_types_by_generic[question_topic]
        specific_words = set().union(*specific_types.values())
        if generic_topics_shared[question_topic] & content_words and specific_words & content_words:
            new_topic = update_question_topic(question_topic, content_list, specific_types)

    if new_topic is not None:
        # Write by index label and column name instead of .iloc[i, 4], which treated the
        # index label as a row position and hard-coded the column position
        question_topic_valid.at[i, 'question_topic'] = new_topic

### Export To CSV File

In [None]:
# Export the dataset after the version 1 relabelling of the 'question_topic' column (index not written)
question_topic_valid.to_csv('../data/question_topic_valid_relabel_r1.csv',index=False)

# Free Memory For Next Step

NB: Optional step if system resources are limited

In [7]:
# %xdel question_topic_valid

## Relabel Question Topic Version 2

The relabelling for this version is more aggressive. It replaces all generic topics even if these topics exist as words in the question content. If a more specific topic exists in the question content then the record will be updated to the more specific topic. The only exception is that animal topics will not be replaced by plant topics and vice versa.

Jackfruit and passionfruit are 2 items that can appear as 1 word or 2. Although 'fruit' is not a 'question_topic', if any question content contains 'jack' and 'fruit' or 'passion' and 'fruit' then the record will be updated to 'jackfruit' or 'passion-fruit' respectively, if not already correctly labelled.

### Assumptions

1. In many cases a question topic was labelled with a generic topic when a more specific topic could have been used.
2. While 'mushroom' is technically not a plant it has been included with other plant groups.
3. While 'locust' is technically an animal it has been included with the other plant groups as it is a pest that affects certain crops.
4. 'bee' refers to bee farms rather than pollinating plants.
5. That even if a more specific topic could have been used, if a generic animal topic was recorded then it should only be replaced by a more specific animal topic.
6. That even if a more specific topic could have been used, if a generic plant topic was recorded then it should only be replaced by a more specific plant topic.
7. Questions that are not in English will not always contain the words which match with the 'question_topic' so will be skipped for relabelling.

In [8]:
# Re-import the dataset with no missing values as it was before the version 1 relabelling
# (same source file as the first import), so version 2 starts from the original labels
question_topic_valid_v2 = pd.read_csv('../data/question_topic_valid.csv')

In [None]:
# Create dictionary for jackfruit and passionfruit, including the first word of the two-word spelling
fruits_dict = {'jackfruit': {'jackfruit','jackfruits','jack'},
               'passion-fruit': {'passion-fruit','passionfruit','passionfruits', 'passion'}
              }

# Word sets that do not change between rows, built once
fruit_name_words = set().union(*fruits_dict.values())
animal_specific_words = set().union(*animal_non_generic.values())
plant_specific_words = set().union(*plant_non_generic.values())

# Check each record for generic topics assigned to 'question_topic' and update if a better topic exists
for i, v in question_topic_valid_v2.iterrows():
    question_topic = v.question_topic

    # Only update English records with a generic name for 'question_topic'
    if question_topic not in all_generic_topics or v.question_language != 'eng':
        continue

    content_list = v.question_content.lower().split()  # 'question_content' as a list of lower-case words
    content_words = set(content_list)
    is_plant_topic = question_topic in plant_generic_topics

    # 'jackfruit' or 'passionfruit' split into 2 words: a generic plant topic whose content has
    # 'fruit' plus a jackfruit/passionfruit word is relabelled from fruits_dict
    if is_plant_topic and 'fruit' in content_words and fruit_name_words & content_words:
        new_topic = update_question_topic(question_topic, content_list, fruits_dict)
    # Otherwise relabel as usual. Previously any row matching a fruits_dict word without a
    # separate 'fruit' token (e.g. 'passionfruit' as one word) was skipped entirely.
    # Only replace animal related topics with other animal related topics
    elif question_topic in animal_generic_topics and animal_specific_words & content_words:
        new_topic = update_question_topic(question_topic, content_list, animal_non_generic)
    # Only replace plant related topics with other plant related topics
    elif is_plant_topic and plant_specific_words & content_words:
        new_topic = update_question_topic(question_topic, content_list, plant_non_generic)
    else:
        continue

    # Write by index label and column name instead of .iloc[i, 4], which treated the
    # index label as a row position and hard-coded the column position
    question_topic_valid_v2.at[i, 'question_topic'] = new_topic

### Export To CSV File

In [None]:
# Export the dataset after the version 2 relabelling of the 'question_topic' column (index not written)
question_topic_valid_v2.to_csv('../data/question_topic_valid_relabel_r2.csv',index=False)

## Suggestions
- If the translation to English of the languages 'lug', 'nyn' and 'swa' is successful then this code can be extended to these records too.