<a href="https://colab.research.google.com/github/olalepek/Text-Mining---Agriculture/blob/main/Functions_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Function definitions


In [None]:
def pie_chart_characteristics_legend(characteristic, dataset):
    """Draw a pie chart of the values in a list-valued column, with a legend.

    Rows whose cell in ``characteristic`` is empty (``len(cell) == 0``) are
    ignored. The remaining cells are flattened into one sequence of items and
    each distinct item becomes one wedge. The legend lists every item with
    its absolute count and its share of the total.

    Args:
        characteristic: Name of the column in ``dataset`` to summarise. Each
            cell must support ``len()`` and iteration.
        dataset: DataFrame holding that column.
    """
    has_entries = dataset[characteristic].apply(lambda x: len(x) > 0)
    non_empty_rows = dataset[has_entries]

    flattened = []
    for sublist in non_empty_rows[characteristic]:
        for item in sublist:
            flattened.append(item)
    category_counts = pd.Series(flattened).value_counts()
    total = sum(category_counts)

    # One legend entry per wedge: "<item> - <count> (<share>%)".
    legend_entries = []
    for label, count in zip(category_counts.index, category_counts):
        legend_entries.append(f'{label} - {count} ({(count/total)*100:.1f}%)')

    plt.figure(figsize=(10, 7))
    # autopct is an empty string, so the per-wedge texts carry no percentage;
    # the percentages are shown in the legend instead.
    wedges, _wedge_texts, _pct_texts = plt.pie(
        category_counts,
        labels=category_counts.index,
        startangle=140,
        autopct="",
    )
    plt.title(f'{characteristic} Distribution')
    plt.legend(
        wedges,
        legend_entries,
        title=f'{characteristic} Summary',
        loc='center left',
        bbox_to_anchor=(1, 0.5),
    )
    plt.show()

In [None]:
def find_matches_for_characteristics(characteristic, excel, sheet, column1, column2, dataset):
    """Tag every row of ``dataset`` with the terminology terms found in its text.

    The terms are read from the first column of ``sheet`` in the ``excel``
    workbook and lowercased. For each row, the text of ``column1`` and
    ``column2`` is joined with a space and searched with a spaCy
    ``PhraseMatcher``. The distinct matched terms are stored as a list in a
    new column named ``characteristic``.

    Matching is case-insensitive (``attr="LOWER"``): the terms are lowercased
    on load, so an exact-text matcher would silently miss every capitalised
    occurrence in the text. Matched terms are returned lowercased so the same
    term is never reported under two spellings.

    Args:
        characteristic: Name of the column to create in ``dataset``.
        excel: Path (or file-like object) of the workbook holding the terms.
        sheet: Sheet name or index containing the terms in its first column.
        column1: Name of the first text column to search.
        column2: Name of the second text column to search.
        dataset: DataFrame to annotate; it is modified in place.

    Returns:
        The first row of the annotated ``dataset`` (``dataset.head(1)``).
    """
    # Open the workbook once and make sure the handle is released afterwards.
    with pd.ExcelFile(excel) as xls:
        print("List of all sheets in the excel file loaded: " + str(xls.sheet_names))
        matches_df = pd.read_excel(xls, sheet_name=sheet)

    # Blank cells come back as NaN and have no .lower(); skip them.
    terms = [str(term).lower() for term in matches_df.iloc[:, 0].dropna()]
    print("List of characteristics " + str(terms))

    nlp = spacy.load("en_core_web_sm")
    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    phrase_matcher.add("TerminologyList", [nlp.make_doc(term) for term in terms])

    def find_matches(first_text, second_text):
        # Missing cells (NaN) are not strings; leave them out of the text.
        parts = [part for part in (first_text, second_text) if isinstance(part, str)]
        doc = nlp(" ".join(parts))
        matched_terms = {doc[start:end].text.lower() for _, start, end in phrase_matcher(doc)}
        return list(matched_terms)

    dataset[characteristic] = dataset.apply(
        lambda row: find_matches(row[column1], row[column2]), axis=1
    )

    return dataset.head(1)

In [None]:
def pie_chart_characteristics(characteristic, dataset):
    """Draw a pie chart of the values in a list-valued column.

    Rows whose cell in ``characteristic`` is empty (``len(cell) == 0``) are
    dropped, a summary of the remaining rows is printed with
    ``DataFrame.info()``, and the remaining cells are flattened so that each
    distinct item becomes one wedge labelled with its percentage.

    Args:
        characteristic: Name of the column in ``dataset`` to summarise. Each
            cell must support ``len()`` and iteration.
        dataset: DataFrame holding that column.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    filtered_df = dataset[dataset[characteristic].apply(lambda x: len(x) > 0)]
    filtered_df.info()

    category_item = pd.Series([item for sublist in filtered_df[characteristic] for item in sublist])
    category_counts = category_item.value_counts()

    plt.figure(figsize=(10, 7))  # Adjust the size of the figure as needed
    plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    # The separating space was missing, which rendered e.g. "CropDistribution".
    plt.title(f'{characteristic} Distribution')

    # Show the pie chart
    return plt.show()

In [None]:
def country_map(column, dataset):
    """Show a choropleth map of how often each country appears in a column.

    Every cell of ``column`` is iterated and its items are pooled into one
    sequence. The occurrences of each distinct item are counted and plotted
    with plotly, using the item as a country name.

    Args:
        column: Name of the column in ``dataset`` whose cells are iterables
            of country names.
        dataset: DataFrame holding that column.

    Returns:
        Whatever ``fig.show()`` returns.
    """
    pooled_items = []
    for sublist in dataset[column]:
        for item in sublist:
            pooled_items.append(item)

    country_counts = pd.Series(pooled_items).value_counts().reset_index()
    country_counts.columns = ['Country', 'Counts']

    choropleth_options = {
        "locations": 'Country',
        "locationmode": 'country names',
        "hover_name": "Country",
        "hover_data": {"Counts": True},
        "color": 'Counts',
        "color_continuous_scale": px.colors.sequential.Plasma,
        "title": 'Country Counts',
    }
    fig = px.choropleth(country_counts, **choropleth_options)
    return fig.show()


In [None]:
def patterns_from_excel(rules, rules_sheet):
    """Load label/pattern rules from an Excel sheet.

    The sheet must have exactly two columns; they are renamed to ``Label``
    and ``Pattern`` regardless of their header. Rows with a missing label or
    pattern are skipped, since a blank cell cannot form a usable rule.

    Args:
        rules: Path (or file-like object) of the workbook.
        rules_sheet: Sheet name or index holding the rules.

    Returns:
        A list of ``{"label": ..., "pattern": ...}`` dicts, one per row.
    """
    rules_df = pd.read_excel(rules, sheet_name=rules_sheet)
    rules_df.columns = ["Label", "Pattern"]
    rules_df = rules_df.dropna(subset=["Label", "Pattern"])

    rules_pattern = [
        {"label": label, "pattern": pattern}
        for label, pattern in zip(rules_df["Label"], rules_df["Pattern"])
    ]
    # The message used to announce the list without actually printing it.
    print("list of patterns identified " + str(rules_pattern))
    return rules_pattern

In [None]:
def set_entity_ruler(nlp, rules, rules_sheet, patterns, dataset, column_to_analyze, output_col_name):
    """Add rule-based entity patterns to ``nlp`` and extract entities per row.

    Rules are read from a two-column Excel sheet (label, pattern) and added
    to an ``entity_ruler`` pipe together with the caller-supplied
    ``patterns``. The pipeline is then run over ``column_to_analyze`` and
    the ``(text, label)`` pair of every entity is stored as a list in
    ``output_col_name``.

    The function is safe to call more than once with the same ``nlp``: an
    existing ``entity_ruler`` is reused instead of being added again (which
    raises). When the ruler is reused, the patterns are appended to the ones
    it already holds. If the pipeline has no ``ner`` component, the ruler is
    appended at the end instead of being placed before ``ner``.

    Args:
        nlp: spaCy pipeline to extend; it is modified in place.
        rules: Path (or file-like object) of the workbook with the rules.
        rules_sheet: Sheet name or index holding the rules.
        patterns: Additional entity-ruler patterns to register.
        dataset: DataFrame to annotate; it is modified in place.
        column_to_analyze: Name of the text column to run the pipeline on.
        output_col_name: Name of the column receiving the entity lists.

    Returns:
        The first row of the annotated ``dataset`` (``dataset.head(1)``).
    """
    rules_df = pd.read_excel(rules, sheet_name=rules_sheet)
    rules_df.columns = ["Label", "Pattern"]
    # A blank cell cannot form a usable rule, so skip incomplete rows.
    rules_df = rules_df.dropna(subset=["Label", "Pattern"])
    rules_pattern = [
        {"label": label, "pattern": pattern}
        for label, pattern in zip(rules_df["Label"], rules_df["Pattern"])
    ]
    print("list of patterns identified " + str(patterns))

    if nlp.has_pipe("entity_ruler"):
        ruler = nlp.get_pipe("entity_ruler")
    elif nlp.has_pipe("ner"):
        # Placed before "ner" so the ruler runs ahead of the statistical NER.
        ruler = nlp.add_pipe("entity_ruler", before="ner")
    else:
        ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(rules_pattern)
    ruler.add_patterns(patterns)

    dataset[output_col_name] = dataset[column_to_analyze].apply(
        lambda text: [(ent.text, ent.label_) for ent in nlp(text).ents]
    )

    return dataset.head(1)

In [None]:
def matcher(text, dep_matcher=None, nlp_pipeline=None):
    """Return the matched token texts for each match found in ``text``.

    The original body called ``matcher(doc)``, which resolves to this very
    function and therefore recursed instead of invoking a matcher object.
    The matcher must now be passed in explicitly.

    Args:
        text: Text to analyse.
        dep_matcher: Callable that takes the parsed document and returns an
            iterable of ``(match_id, token_ids)`` pairs. Required.
        nlp_pipeline: Callable turning ``text`` into an indexable document
            whose items have a ``.text`` attribute. Defaults to the
            module-level ``nlp``.

    Returns:
        A list with one string per match: the texts of the matched tokens
        joined by single spaces, in the order given by ``token_ids``.

    Raises:
        ValueError: If ``dep_matcher`` is not provided.
    """
    if dep_matcher is None:
        raise ValueError(
            "matcher() needs a matcher object: pass it as dep_matcher "
            "(the function cannot look it up under its own name)"
        )
    if nlp_pipeline is None:
        nlp_pipeline = nlp  # module-level pipeline, as in the original code

    doc = nlp_pipeline(text)
    return [
        ' '.join(doc[i].text for i in token_ids)
        for _match_id, token_ids in dep_matcher(doc)
    ]


In [None]:
def regular_matcher(nlp, dataset, column_to_analyze, new_column, patterns, label):
    """Run token-pattern rules over a text column and store the matched spans.

    Each entry of ``patterns`` is registered on a spaCy ``Matcher`` under its
    own key. Every cell of ``column_to_analyze`` is parsed with ``nlp`` and
    the text of each matched span is collected into a list stored in
    ``new_column``.

    Args:
        nlp: spaCy pipeline used to parse the text.
        dataset: DataFrame to annotate; it is modified in place.
        column_to_analyze: Name of the text column to search.
        new_column: Name of the column receiving the lists of matched texts.
        patterns: Mapping of rule name to a single token pattern.
        label: Accepted but not used by this function.

    Returns:
        The annotated ``dataset``.
    """
    token_matcher = Matcher(nlp.vocab)
    for rule_name, rule in patterns.items():
        token_matcher.add(rule_name, [rule])

    def _matched_span_texts(text):
        parsed = nlp(text)
        span_texts = []
        for _match_id, first, last in token_matcher(parsed):
            span_texts.append(parsed[first:last].text)
        return span_texts

    matched_column = dataset[column_to_analyze].apply(_matched_span_texts)
    dataset[new_column] = matched_column
    return dataset

In [None]:
def filter_label(entity_label_list, interested_labels):
    """Return the distinct labels of interest present in a list of pairs.

    Args:
        entity_label_list: Iterable of two-item ``(entity, label)`` pairs.
        interested_labels: Container of labels to keep (tested with ``in``).

    Returns:
        A list with each kept label once; the order is that of a ``set``
        and therefore not guaranteed.
    """
    kept = set()
    for pair in entity_label_list:
        _, tag = pair
        if tag in interested_labels:
            kept.add(tag)
    return list(kept)



In [None]:
def dependency_matcher(nlp, patterns, dataset, column_name_to_match, new_column):
    """Run dependency-pattern rules over a text column and store the matches.

    Each entry of ``patterns`` is registered on a spaCy ``DependencyMatcher``
    under its own key. Every cell of ``column_name_to_match`` is parsed with
    ``nlp``; for each match, the texts of the matched tokens are joined with
    single spaces, and the resulting strings are stored as a list in
    ``new_column``.

    Args:
        nlp: spaCy pipeline used to parse the text.
        patterns: Mapping of rule name to a single dependency pattern.
        dataset: DataFrame to annotate; it is modified in place.
        column_name_to_match: Name of the text column to search.
        new_column: Name of the column receiving the lists of matched texts.

    Returns:
        The first row of the annotated ``dataset`` (``dataset.head(1)``).
    """
    dep_matcher = DependencyMatcher(nlp.vocab)
    for pattern_name, pattern in patterns.items():
        # Register each rule under its own name. The previous code passed the
        # whole ``patterns`` dict under the constant key "Pattern" each time.
        dep_matcher.add(pattern_name, [pattern])

    def apply_matcher(text):
        doc = nlp(text)
        return [
            ' '.join(doc[i].text for i in token_ids)
            for _match_id, token_ids in dep_matcher(doc)
        ]

    dataset[new_column] = dataset[column_name_to_match].apply(apply_matcher)
    return dataset.head(1)

In [None]:
def timeline_of_phrases_used_articles(dataframe, column_with_phrases, year_column, phrases_as_list):
    """Plot, per phrase, how many rows mention it in each year.

    For every phrase, the rows whose ``column_with_phrases`` text contains it
    (case-insensitively, missing values counted as no match) are grouped by
    ``year_column`` and counted. All phrases are drawn as separate lines on
    one plotly chart.

    NOTE(review): each phrase is passed to ``Series.str.contains`` with its
    default settings, so it is presumably interpreted as a regular
    expression; phrases with regex metacharacters may need escaping by the
    caller. Confirm against the pandas documentation.

    Args:
        dataframe: DataFrame holding the text and year columns.
        column_with_phrases: Name of the string column to search.
        year_column: Name of the column to group by.
        phrases_as_list: Iterable of phrases to look for.

    Raises:
        ValueError: If ``phrases_as_list`` is empty. Previously this surfaced
            as an unrelated-looking ``ValueError`` from ``pd.concat``.
    """
    phrases = list(phrases_as_list)
    if not phrases:
        raise ValueError("phrases_as_list must contain at least one phrase to plot")

    per_phrase_counts = []
    for phrase in phrases:
        mask = dataframe[column_with_phrases].str.contains(phrase, case=False, na=False)
        counts_per_year = (
            dataframe[mask].groupby(year_column).size().reset_index(name='article_count')
        )
        counts_per_year['Term'] = phrase
        per_phrase_counts.append(counts_per_year)
    combined_counts = pd.concat(per_phrase_counts)

    fig = px.line(combined_counts, x=year_column, y='article_count', color='Term',
                  title='Article Count Over Years for Each Term',
                  markers=True)

    fig.update_layout(xaxis_title='Year', yaxis_title='Article Count', legend_title="Term")
    fig.show()



In [None]:
def count_values_non_empty_column(columns_of_interest, dataset):
    """Count, for each requested column, the rows whose cell is non-empty.

    A cell counts as non-empty when ``len(cell) > 0``.

    Args:
        columns_of_interest: Iterable of column names to inspect.
        dataset: DataFrame holding those columns; every cell must support
            ``len()``.

    Returns:
        A dict mapping each column name to its number of non-empty cells.
    """
    def _has_items(cell):
        return len(cell) > 0

    return {
        name: dataset[name].apply(_has_items).sum()
        for name in columns_of_interest
    }


In [None]:
def pie_chart_column_count(unique_counts, title):
    """Draw a pie chart from a mapping of label to count.

    Args:
        unique_counts: Mapping whose keys become the wedge labels and whose
            values become the wedge sizes.
        title: Title shown above the chart.
    """
    wedge_labels = unique_counts.keys()
    wedge_sizes = unique_counts.values()

    # Build the chart on a fresh figure/axes pair.
    fig, ax = plt.subplots()
    pie_options = {
        "labels": wedge_labels,
        "autopct": '%1.1f%%',
        "startangle": 90,
    }
    ax.pie(wedge_sizes, **pie_options)
    ax.axis('equal')  # Equal scaling on both axes keeps the pie circular.

    # Title and render.
    plt.title(title)
    plt.show()